YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/tserver/ts_tablet_manager.cc
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#include "yb/tserver/ts_tablet_manager.h"
34
35
#include <algorithm>
36
#include <chrono>
37
#include <memory>
38
#include <mutex>
39
#include <string>
40
#include <thread>
41
#include <vector>
42
43
#include <boost/container/static_vector.hpp>
44
#include <boost/optional/optional.hpp>
45
#include <glog/logging.h>
46
47
#include "yb/client/client.h"
48
49
#include "yb/common/wire_protocol.h"
50
51
#include "yb/consensus/consensus.h"
52
#include "yb/consensus/multi_raft_batcher.h"
53
#include "yb/consensus/consensus_meta.h"
54
#include "yb/consensus/log.h"
55
#include "yb/consensus/log_anchor_registry.h"
56
#include "yb/consensus/metadata.pb.h"
57
#include "yb/consensus/opid_util.h"
58
#include "yb/consensus/quorum_util.h"
59
#include "yb/consensus/raft_consensus.h"
60
#include "yb/consensus/retryable_requests.h"
61
#include "yb/consensus/state_change_context.h"
62
63
#include "yb/docdb/docdb_rocksdb_util.h"
64
65
#include "yb/fs/fs_manager.h"
66
67
#include "yb/gutil/bind.h"
68
#include "yb/gutil/stl_util.h"
69
#include "yb/gutil/strings/substitute.h"
70
#include "yb/gutil/sysinfo.h"
71
72
#include "yb/master/master_heartbeat.pb.h"
73
#include "yb/master/sys_catalog.h"
74
75
#include "yb/rpc/messenger.h"
76
#include "yb/rpc/poller.h"
77
78
#include "yb/tablet/metadata.pb.h"
79
#include "yb/tablet/operations/split_operation.h"
80
#include "yb/tablet/tablet.h"
81
#include "yb/tablet/tablet.pb.h"
82
#include "yb/tablet/tablet_bootstrap_if.h"
83
#include "yb/tablet/tablet_metadata.h"
84
#include "yb/tablet/tablet_options.h"
85
#include "yb/tablet/tablet_peer.h"
86
87
#include "yb/tserver/heartbeater.h"
88
#include "yb/tserver/remote_bootstrap_client.h"
89
#include "yb/tserver/remote_bootstrap_session.h"
90
#include "yb/tserver/tablet_server.h"
91
#include "yb/tserver/tserver.pb.h"
92
93
#include "yb/util/debug/long_operation_tracker.h"
94
#include "yb/util/debug/trace_event.h"
95
#include "yb/util/env.h"
96
#include "yb/util/fault_injection.h"
97
#include "yb/util/flag_tags.h"
98
#include "yb/util/format.h"
99
#include "yb/util/logging.h"
100
#include "yb/util/mem_tracker.h"
101
#include "yb/util/metrics.h"
102
#include "yb/util/pb_util.h"
103
#include "yb/util/scope_exit.h"
104
#include "yb/util/shared_lock.h"
105
#include "yb/util/status_format.h"
106
#include "yb/util/status_log.h"
107
#include "yb/util/stopwatch.h"
108
#include "yb/util/trace.h"
109
110
using namespace std::literals;
111
using namespace std::placeholders;
112
113
// Upper bound on concurrent tablet opens at startup. TSTabletManager::Init() treats 0 as
// "auto": 2 threads on hosts with at most two CPUs, otherwise
// min(num_cpus - 1, 8 * number of data root dirs).
DEFINE_int32(num_tablets_to_open_simultaneously, 0,
             "Number of threads available to open tablets during startup. If this "
             "is set to 0 (the default), then the number of bootstrap threads will "
             "be set based on the number of data directories. If the data directories "
             "are on some very fast storage device such as SSD or a RAID array, it "
             "may make sense to manually tune this.");
TAG_FLAG(num_tablets_to_open_simultaneously, advanced);

// Threshold (milliseconds) for warning about slow tablet starts.
DEFINE_int32(tablet_start_warn_threshold_ms, 500,
             "If a tablet takes more than this number of millis to start, issue "
             "a warning with a trace.");
TAG_FLAG(tablet_start_warn_threshold_ms, hidden);

// Period (seconds) of the split-tablet cleanup poller started in TSTabletManager::Start();
// the poller is not started unless the value is positive.
DEFINE_int32(cleanup_split_tablets_interval_sec, 60,
             "Interval at which tablet manager tries to cleanup split tablets which are no longer "
             "needed. Setting this to 0 disables cleanup of split tablets.");
129
130
DEFINE_test_flag(double, fault_crash_after_blocks_deleted, 0.0,
131
                 "Fraction of the time when the tablet will crash immediately "
132
                 "after deleting the data blocks during tablet deletion.");
133
134
DEFINE_test_flag(double, fault_crash_after_wal_deleted, 0.0,
135
                 "Fraction of the time when the tablet will crash immediately "
136
                 "after deleting the WAL segments during tablet deletion.");
137
138
DEFINE_test_flag(double, fault_crash_after_cmeta_deleted, 0.0,
139
                 "Fraction of the time when the tablet will crash immediately "
140
                 "after deleting the consensus metadata during tablet deletion.");
141
142
DEFINE_test_flag(double, fault_crash_after_rb_files_fetched, 0.0,
143
                 "Fraction of the time when the tablet will crash immediately "
144
                 "after fetching the files during a remote bootstrap but before "
145
                 "marking the superblock as TABLET_DATA_READY.");
146
147
DEFINE_test_flag(double, fault_crash_in_split_after_log_copied, 0.0,
148
                 "Fraction of the time when the tablet will crash immediately after initiating a "
149
                 "Log::CopyTo from parent to child tablet, but before marking the child tablet as "
150
                 "TABLET_DATA_READY.");
151
152
DEFINE_test_flag(bool, simulate_already_present_in_remote_bootstrap, false,
153
                 "If true, return an AlreadyPresent error in remote bootstrap after starting the "
154
                 "remote bootstrap client.");
155
156
DEFINE_test_flag(double, fault_crash_in_split_before_log_flushed, 0.0,
157
                 "Fraction of the time when the tablet will crash immediately before flushing a "
158
                 "parent tablet's kSplit operation.");
159
160
DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_greater_than, 0,
161
                 "If greater than zero, this process will crash if we detect more than the "
162
                 "specified number of remote bootstrap sessions.");
163
164
DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_per_table_greater_than, 0,
165
                 "If greater than zero, this process will crash if for any table we exceed the "
166
                 "specified number of remote bootstrap sessions");
167
168
DEFINE_test_flag(bool, crash_before_apply_tablet_split_op, false,
169
                 "Crash inside TSTabletManager::ApplyTabletSplit before doing anything");
170
171
DEFINE_test_flag(bool, force_single_tablet_failure, false,
172
                 "Force exactly one tablet to a failed state.");
173
174
DEFINE_test_flag(int32, apply_tablet_split_inject_delay_ms, 0,
175
                 "Inject delay into TSTabletManager::ApplyTabletSplit.");
176
177
DEFINE_test_flag(bool, skip_deleting_split_tablets, false,
178
                 "Skip deleting tablets which have been split.");
179
180
DEFINE_test_flag(bool, skip_post_split_compaction, false,
181
                 "Skip processing post split compaction.");
182
183
// Period (seconds) of the VerifyTabletData poller; TSTabletManager::Start() starts it only
// when the value is positive.
DEFINE_int32(verify_tablet_data_interval_sec, 0,
             "The tick interval time for the tablet data integrity verification background task. "
             "This defaults to 0, which means disable the background task.");

// Period (seconds) of the CleanupOldMetrics poller; TSTabletManager::Start() starts it only
// when the value is positive.
DEFINE_int32(cleanup_metrics_interval_sec, 60,
             "The tick interval time for the metrics cleanup background task. "
             "If set to 0, it disables the background task.");

// When true, VerifyTabletData() logs each running tablet as skipped instead of checking it.
DEFINE_bool(skip_tablet_data_verification, false,
            "Skip checking tablet data for corruption.");

// Sizing of the "read-parallel" pool (read_pool_) built in the TSTabletManager constructor.
DEFINE_int32(read_pool_max_threads, 128,
             "The maximum number of threads allowed for read_pool_. This pool is used "
             "to run multiple read operations, that are part of the same tablet rpc, "
             "in parallel.");
DEFINE_int32(read_pool_max_queue_size, 128,
             "The maximum number of tasks that can be held in the queue for read_pool_. This pool "
             "is used to run multiple read operations, that are part of the same tablet rpc, "
             "in parallel.");

// Sizing of the "tablet-split-compaction" pool built in the TSTabletManager constructor.
DEFINE_int32(post_split_trigger_compaction_pool_max_threads, 1,
             "The maximum number of threads allowed for post_split_trigger_compaction_pool_. This "
             "pool is used to run compactions on tablets after they have been split and still "
             "contain irrelevant data from the tablet they were sourced from.");
DEFINE_int32(post_split_trigger_compaction_pool_max_queue_size, 16,
             "The maximum number of tasks that can be held in the pool for "
             "post_split_trigger_compaction_pool_. This pool is used to run compactions on tablets "
             "after they have been split and still contain irrelevant data from the tablet they "
             "were sourced from.");
212
213
// Test-only: the flag is an int32 number of seconds, so the help text describes a duration
// rather than a yes/no switch.
DEFINE_test_flag(int32, sleep_after_tombstoning_tablet_secs, 0,
                 "Number of seconds to sleep in LogAndTombstone after calling DeleteTabletData.");
215
216
constexpr int kTServerYbClientDefaultTimeoutMs = 60 * 1000;
217
218
DEFINE_int32(tserver_yb_client_default_timeout_ms, kTServerYbClientDefaultTimeoutMs,
219
             "Default timeout for the YBClient embedded into the tablet server that is used "
220
             "for distributed transactions.");
221
222
DEFINE_bool(enable_restart_transaction_status_tablets_first, true,
223
            "Set to true to prioritize bootstrapping transaction status tablets first.");
224
225
DECLARE_string(rocksdb_compact_flush_rate_limit_sharing_mode);
226
227
namespace yb {
228
namespace tserver {
229
230
// Histograms attached to the "apply" thread pool in the TSTabletManager constructor.
METRIC_DEFINE_coarse_histogram(
    server, op_apply_queue_length, "Operation Apply Queue Length",
    MetricUnit::kTasks,
    "Number of operations waiting to be applied to the tablet. "
    "High queue lengths indicate that the server is unable to process "
    "operations as fast as they are being written to the WAL.");

METRIC_DEFINE_coarse_histogram(
    server, op_apply_queue_time, "Operation Apply Queue Time",
    MetricUnit::kMicroseconds,
    "Time that operations spent waiting in the apply queue before being "
    "processed. High queue times indicate that the server is unable to "
    "process operations as fast as they are being written to the WAL.");

METRIC_DEFINE_coarse_histogram(
    server, op_apply_run_time, "Operation Apply Run Time",
    MetricUnit::kMicroseconds,
    "Time that operations spent being applied to the tablet. "
    "High values may indicate that the server is under-provisioned or "
    "that operations consist of very large batches.");
247
248
// Histograms attached to the "read-parallel" thread pool (read_pool_) in the TSTabletManager
// constructor. The descriptions talk about the read pool; the previous text was copied from
// the apply-pool metrics and referred to applying operations and the WAL.
METRIC_DEFINE_coarse_histogram(server, op_read_queue_length, "Operation Read op Queue Length",
                        MetricUnit::kTasks,
                        "Number of read operations waiting in the read pool queue. "
                        "High queue lengths indicate that the server is unable to process "
                        "read operations as fast as they are being submitted.");

METRIC_DEFINE_coarse_histogram(server, op_read_queue_time, "Operation Read op Queue Time",
                        MetricUnit::kMicroseconds,
                        "Time that read operations spent waiting in the read queue before being "
                        "processed. High queue times indicate that the server is unable to "
                        "process read operations as fast as they are being submitted.");

METRIC_DEFINE_coarse_histogram(server, op_read_run_time, "Operation Read op Run Time",
                        MetricUnit::kMicroseconds,
                        "Time that read operations spent running in the read pool. "
                        "High values may indicate that the server is under-provisioned or "
                        "that individual read operations are expensive.");
265
266
// Supplied as the third ThreadPoolMetrics entry of the "tablet-bootstrap" pool in
// TSTabletManager::Init().
METRIC_DEFINE_coarse_histogram(
    server, ts_bootstrap_time, "TServer Bootstrap Time",
    MetricUnit::kMicroseconds,
    "Time that the tablet server takes to bootstrap all of its tablets.");

// Metrics for the "tablet-split-compaction" pool.
THREAD_POOL_METRICS_DEFINE(
    server, post_split_trigger_compaction_pool, "Thread pool for tablet compaction jobs.");

// Metrics for the "admin-compaction" pool.
THREAD_POOL_METRICS_DEFINE(
    server, admin_triggered_compaction_pool, "Thread pool for tablet compaction jobs.");
275
276
using consensus::ConsensusMetadata;
277
using consensus::ConsensusStatePB;
278
using consensus::RaftConfigPB;
279
using consensus::RaftPeerPB;
280
using consensus::StartRemoteBootstrapRequestPB;
281
using log::Log;
282
using master::ReportedTabletPB;
283
using master::TabletReportPB;
284
using master::TabletReportUpdatesPB;
285
using std::shared_ptr;
286
using std::string;
287
using std::unordered_set;
288
using std::vector;
289
using strings::Substitute;
290
using tablet::BOOTSTRAPPING;
291
using tablet::NOT_STARTED;
292
using tablet::RaftGroupMetadata;
293
using tablet::RaftGroupMetadataPtr;
294
using tablet::RaftGroupStatePB;
295
using tablet::RUNNING;
296
using tablet::TABLET_DATA_COPYING;
297
using tablet::TABLET_DATA_DELETED;
298
using tablet::TABLET_DATA_INIT_STARTED;
299
using tablet::TABLET_DATA_READY;
300
using tablet::TABLET_DATA_SPLIT_COMPLETED;
301
using tablet::TABLET_DATA_TOMBSTONED;
302
using tablet::TabletDataState;
303
using tablet::TabletPeer;
304
using tablet::TabletPeerPtr;
305
using tablet::TabletStatusListener;
306
using tablet::TabletStatusPB;
307
308
// Default block cache size percentage passed to TabletMemoryManager by the TSTabletManager
// constructor.
constexpr int32_t kDefaultTserverBlockCacheSizePercentage = 50;
309
310
0
void TSTabletManager::VerifyTabletData() {
311
0
  LOG_WITH_PREFIX(INFO) << "Beginning tablet data verification checks";
312
0
  for (const TabletPeerPtr& peer : GetTabletPeers()) {
313
0
    if (peer->state() == RUNNING) {
314
0
      if (PREDICT_FALSE(FLAGS_skip_tablet_data_verification)) {
315
0
        LOG_WITH_PREFIX(INFO)
316
0
            << Format("Skipped tablet data verification check on $0", peer->tablet_id());
317
0
      } else {
318
0
        Status s = peer->tablet()->VerifyDataIntegrity();
319
0
        if (!s.ok()) {
320
0
          LOG(WARNING) << "Tablet data integrity verification failed on " << peer->tablet_id()
321
0
                       << ": " << s;
322
0
        }
323
0
      }
324
0
    }
325
0
  }
326
0
}
327
328
1.72k
void TSTabletManager::CleanupOldMetrics() {
329
0
  VLOG(2) << "Cleaning up old metrics";
330
1.72k
  metric_registry_->RetireOldMetrics();
331
1.72k
}
332
333
// Constructs the manager in the MANAGER_INITIALIZING state and builds the thread pools that do
// not depend on the filesystem: apply, consensus, prepare, append, log-alloc, read-parallel,
// tablet-split-compaction and admin-compaction. The tablet memory manager is created last.
// The tablet-bootstrap pool is created later, in Init().
//
// fs_manager, server and metric_registry are stored as raw pointers; `server` is dereferenced
// here, so it must be non-null.
TSTabletManager::TSTabletManager(FsManager* fs_manager,
                                 TabletServer* server,
                                 MetricRegistry* metric_registry)
    : fs_manager_(fs_manager),
      server_(server),
      metric_registry_(metric_registry),
      state_(MANAGER_INITIALIZING) {
  // All server-level metrics below hang off the same entity.
  const auto& server_entity = server_->metric_entity();

  ThreadPoolMetrics apply_metrics = {
      METRIC_op_apply_queue_length.Instantiate(server_entity),
      METRIC_op_apply_queue_time.Instantiate(server_entity),
      METRIC_op_apply_run_time.Instantiate(server_entity)
  };
  tablet_options_.ServerMetricEntity = server_entity;
  CHECK_OK(ThreadPoolBuilder("apply")
               .set_metrics(std::move(apply_metrics))
               .Build(&apply_pool_));

  // The consensus pool is shared by every replica hosted on this server.
  //
  // Because some submitted tasks perform blocking IO, the pools below are built without an
  // upper bound on the thread count (the default of "number of CPUs" could let blocking tasks
  // starve the "fast" ones). In practice the bound is the number of replicas, since each
  // replica submits its tasks through a dedicated token.
  CHECK_OK(ThreadPoolBuilder("consensus")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&raft_pool_));
  CHECK_OK(ThreadPoolBuilder("prepare")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&tablet_prepare_pool_));
  CHECK_OK(ThreadPoolBuilder("append")
               .set_min_threads(1)
               .unlimited_threads()
               .set_idle_timeout(MonoDelta::FromMilliseconds(10000))
               .Build(&append_pool_));
  CHECK_OK(ThreadPoolBuilder("log-alloc")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&allocation_pool_));

  // Bounded pool for read operations, sized by --read_pool_max_threads and
  // --read_pool_max_queue_size.
  ThreadPoolMetrics read_pool_metrics = {
      METRIC_op_read_queue_length.Instantiate(server_entity),
      METRIC_op_read_queue_time.Instantiate(server_entity),
      METRIC_op_read_run_time.Instantiate(server_entity)
  };
  CHECK_OK(ThreadPoolBuilder("read-parallel")
               .set_max_threads(FLAGS_read_pool_max_threads)
               .set_max_queue_size(FLAGS_read_pool_max_queue_size)
               .set_metrics(std::move(read_pool_metrics))
               .Build(&read_pool_));

  // Compaction pools: one for post-split compactions, one for admin-triggered compactions.
  CHECK_OK(ThreadPoolBuilder("tablet-split-compaction")
               .set_max_threads(FLAGS_post_split_trigger_compaction_pool_max_threads)
               .set_max_queue_size(FLAGS_post_split_trigger_compaction_pool_max_queue_size)
               .set_metrics(THREAD_POOL_METRICS_INSTANCE(
                   server_entity, post_split_trigger_compaction_pool))
               .Build(&post_split_trigger_compaction_pool_));
  CHECK_OK(ThreadPoolBuilder("admin-compaction")
               .set_max_threads(std::max(docdb::GetGlobalRocksDBPriorityThreadPoolSize(), 0))
               .set_metrics(THREAD_POOL_METRICS_INSTANCE(
                   server_entity, admin_triggered_compaction_pool))
               .Build(&admin_triggered_compaction_pool_));

  // The memory manager obtains the current tablet peers through the callback on demand.
  mem_manager_ = std::make_shared<TabletMemoryManager>(
      &tablet_options_,
      server_->mem_tracker(),
      kDefaultTserverBlockCacheSizePercentage,
      server_entity,
      [this]() { return GetTabletPeers(); });
}
403
404
111
// Nothing to release explicitly; members clean up after themselves.
TSTabletManager::~TSTabletManager() = default;
406
407
5.81k
// Initializes the manager. Must be called while the state is MANAGER_INITIALIZING.
//
// In order, this:
//   1. prepares the asynchronous YBClient initializer and fills in the tablet options;
//   2. builds the "tablet-bootstrap" thread pool;
//   3. removes leftover checkpoint directories;
//   4. loads the metadata of every tablet found on disk;
//   5. registers a tablet peer and submits an OpenTablet task for each servable tablet;
//   6. moves the manager to MANAGER_RUNNING, initializes the memory manager and creates the
//      background pollers (they are started in Start()).
//
// Returns the first error hit while building the pool, listing or opening tablet metadata,
// starting a state transition, registering a peer, submitting an open task, or initializing
// the memory manager.
Status TSTabletManager::Init() {
  CHECK_EQ(state(), MANAGER_INITIALIZING);

  // The flag is in milliseconds and is divided by 1000 here (presumably the initializer
  // expects seconds -- TODO confirm).
  async_client_init_.emplace(
      "tserver_client", 0 /* num_reactors */,
      FLAGS_tserver_yb_client_default_timeout_ms / 1000, server_->permanent_uuid(),
      &server_->options(), server_->metric_entity(), server_->mem_tracker(),
      server_->messenger());

  // Once the client exists, point it at the local tablet server, if one with a proxy is
  // available.
  async_client_init_->AddPostCreateHook([this](client::YBClient* client) {
    auto* const local_tserver = server();
    if (local_tserver == nullptr || local_tserver->proxy() == nullptr) {
      return;
    }
    client->SetLocalTabletServer(
        local_tserver->permanent_uuid(), local_tserver->proxy(), local_tserver);
  });

  tablet_options_.env = server_->GetEnv();
  tablet_options_.rocksdb_env = server_->GetRocksDBEnv();
  tablet_options_.listeners = server_->options().listeners;
  if (docdb::GetRocksDBRateLimiterSharingMode() == docdb::RateLimiterSharingMode::TSERVER) {
    tablet_options_.rate_limiter = docdb::CreateRocksDBRateLimiter();
  }

  // The pool used to open tablets is created here rather than in the constructor because the
  // FsManager is not initialized before this point.
  int max_bootstrap_threads = FLAGS_num_tablets_to_open_simultaneously;
  if (max_bootstrap_threads == 0) {
    // Auto-size: 2 threads on small hosts, otherwise bounded by both the CPU count and the
    // number of data root directories.
    const int num_cpus = base::NumCPUs();
    max_bootstrap_threads = num_cpus <= 2
        ? 2
        : min(num_cpus - 1, narrow_cast<int>(fs_manager_->GetDataRootDirs().size()) * 8);
    LOG_WITH_PREFIX(INFO) <<  "max_bootstrap_threads=" << max_bootstrap_threads;
  }
  ThreadPoolMetrics bootstrap_metrics = {
      nullptr,
      nullptr,
      METRIC_ts_bootstrap_time.Instantiate(server_->metric_entity())
  };
  RETURN_NOT_OK(ThreadPoolBuilder("tablet-bootstrap")
                    .set_max_threads(max_bootstrap_threads)
                    .set_metrics(std::move(bootstrap_metrics))
                    .Build(&open_tablet_pool_));

  CleanupCheckpoints();

  // Discover the tablets present in the metadata dir.
  vector<string> tablet_ids;
  RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablet_ids));

  InitLocalRaftPeerPB();

  multi_raft_manager_ = std::make_unique<consensus::MultiRaftManager>(
      server_->messenger(), &server_->proxy_cache(), local_peer_pb_.cloud_info());

  deque<RaftGroupMetadataPtr> metas;

  // Load every tablet's metadata up front, before any OpenTablet() task is submitted, so the
  // loading does not compete for disk resources with bootstrap processes and running tablets.
  const MonoTime load_start = MonoTime::Now();
  for (const string& tablet_id : tablet_ids) {
    RaftGroupMetadataPtr meta;
    RETURN_NOT_OK_PREPEND(OpenTabletMeta(tablet_id, &meta),
                          "Failed to open tablet metadata for tablet: " + tablet_id);
    if (PREDICT_FALSE(!CanServeTabletData(meta->tablet_data_state()))) {
      RETURN_NOT_OK(HandleNonReadyTabletOnStartup(meta));
      continue;
    }
    RegisterDataAndWalDir(
        fs_manager_, meta->table_id(), meta->raft_group_id(), meta->data_root_dir(),
        meta->wal_root_dir());

    // Transaction status tablets jump to the front of the queue when prioritization is on.
    const bool bootstrap_first =
        FLAGS_enable_restart_transaction_status_tablets_first &&
        meta->table_type() == TRANSACTION_STATUS_TABLE_TYPE;
    if (bootstrap_first) {
      metas.push_front(meta);
    } else {
      metas.push_back(meta);
    }
  }

  const MonoDelta load_elapsed = MonoTime::Now().GetDeltaSince(load_start);
  LOG(INFO) << "Loaded metadata for " << tablet_ids.size() << " tablet in "
            << load_elapsed.ToMilliseconds() << " ms";

  // Submit one "Open" task per loaded tablet.
  for (const RaftGroupMetadataPtr& meta : metas) {
    scoped_refptr<TransitionInProgressDeleter> deleter;
    RETURN_NOT_OK(StartTabletStateTransition(
        meta->raft_group_id(), "opening tablet", &deleter));

    // The peer must be registered before the open task runs; the local itself is not used.
    TabletPeerPtr tablet_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER));
    RETURN_NOT_OK(open_tablet_pool_->SubmitFunc(
        std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)));
  }

  {
    std::lock_guard<RWMutex> lock(mutex_);
    state_ = MANAGER_RUNNING;
  }

  RETURN_NOT_OK(mem_manager_->Init());

  // Background pollers are only constructed here; Start() decides which ones to run.
  tablets_cleaner_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::CleanupSplitTablets, this));

  verify_tablet_data_poller_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::VerifyTabletData, this));

  metrics_cleaner_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::CleanupOldMetrics, this));

  return Status::OK();
}
528
529
5.81k
void TSTabletManager::CleanupCheckpoints() {
530
5.91k
  for (const auto& data_root : fs_manager_->GetDataRootDirs()) {
531
5.91k
    auto tables_dir = JoinPathSegments(data_root, FsManager::kRocksDBDirName);
532
5.91k
    auto tables = fs_manager_->env()->GetChildren(tables_dir, ExcludeDots::kTrue);
533
5.91k
    if (!tables.ok()) {
534
0
      LOG_WITH_PREFIX(WARNING)
535
0
          << "Failed to get tables in " << tables_dir << ": " << tables.status();
536
0
      continue;
537
0
    }
538
5.91k
    for (const auto& table : *tables) {
539
95
      auto table_dir = JoinPathSegments(tables_dir, table);
540
95
      auto tablets = fs_manager_->env()->GetChildren(table_dir, ExcludeDots::kTrue);
541
95
      if (!tablets.ok()) {
542
0
        LOG_WITH_PREFIX(WARNING)
543
0
            << "Failed to get tablets in " << table_dir << ": " << tablets.status();
544
0
        continue;
545
0
      }
546
578
      for (const auto& tablet : *tablets) {
547
578
        auto checkpoints_dir = JoinPathSegments(
548
578
            table_dir, tablet, RemoteBootstrapSession::kCheckpointsDir);
549
578
        if (fs_manager_->env()->FileExists(checkpoints_dir)) {
550
2
          LOG_WITH_PREFIX(INFO) << "Cleaning up checkpoints dir: " << yb::ToString(checkpoints_dir);
551
2
          auto status = fs_manager_->env()->DeleteRecursively(checkpoints_dir);
552
2
          WARN_NOT_OK(status, Format("Cleanup of checkpoints dir $0 failed", checkpoints_dir));
553
2
        }
554
578
      }
555
95
    }
556
5.91k
  }
557
5.81k
}
558
559
5.80k
// Starts the asynchronous client initialization and the background pollers created by Init().
// Each poller runs on the messenger's scheduler and is started only when its interval flag is
// positive; otherwise a message saying the task is disabled is logged. Always returns OK.
Status TSTabletManager::Start() {
  async_client_init_->Start();

  // Split tablets cleanup.
  if (FLAGS_cleanup_split_tablets_interval_sec <= 0) {
    LOG(INFO)
        << "Split tablets cleanup is disabled by cleanup_split_tablets_interval_sec flag set to 0";
  } else {
    tablets_cleaner_->Start(
        &server_->messenger()->scheduler(), FLAGS_cleanup_split_tablets_interval_sec * 1s);
    LOG(INFO) << "Split tablets cleanup monitor started...";
  }

  // Tablet data verification.
  if (FLAGS_verify_tablet_data_interval_sec <= 0) {
    LOG(INFO)
        << "Tablet data verification is disabled by verify_tablet_data_interval_sec flag set to 0";
  } else {
    verify_tablet_data_poller_->Start(
        &server_->messenger()->scheduler(), FLAGS_verify_tablet_data_interval_sec * 1s);
    LOG(INFO) << "Tablet data verification task started...";
  }

  // Old metrics cleanup.
  if (FLAGS_cleanup_metrics_interval_sec <= 0) {
    LOG(INFO)
        << "Old metrics cleanup is disabled by cleanup_metrics_interval_sec flag set to 0";
  } else {
    metrics_cleaner_->Start(
        &server_->messenger()->scheduler(), FLAGS_cleanup_metrics_interval_sec * 1s);
    LOG(INFO) << "Old metrics cleanup task started...";
  }

  return Status::OK();
}
588
589
3.73k
void TSTabletManager::CleanupSplitTablets() {
590
0
  VLOG_WITH_PREFIX_AND_FUNC(3) << "looking for tablets to cleanup...";
591
3.73k
  auto tablet_peers = GetTabletPeers();
592
24.1k
  for (const auto& tablet_peer : tablet_peers) {
593
24.1k
    if (tablet_peer->CanBeDeleted()) {
594
15
      const auto& tablet_id = tablet_peer->tablet_id();
595
15
      if (PREDICT_FALSE(FLAGS_TEST_skip_deleting_split_tablets)) {
596
7
        LOG_WITH_PREFIX(INFO) << Format("Skipped triggering delete of tablet $0", tablet_id);
597
8
      } else {
598
8
        LOG_WITH_PREFIX(INFO) << Format("Triggering delete of tablet $0", tablet_id);
599
8
        client().DeleteNotServingTablet(
600
8
            tablet_peer->tablet_id(), [tablet_id](const Status& status) {
601
8
              LOG(INFO) << Format("Tablet $0 deletion result: $1", tablet_id, status);
602
8
            });
603
8
      }
604
15
    }
605
24.1k
  }
606
3.73k
}
607
608
14
// Blocks until the tablet-open pool has drained, then reports whether any tablet failed.
// Must be called while the manager is MANAGER_RUNNING.
//
// Returns OK when no registered tablet is in the FAILED state; otherwise the first non-OK
// error found among FAILED tablets, in tablet map iteration order.
Status TSTabletManager::WaitForAllBootstrapsToFinish() {
  CHECK_EQ(state(), MANAGER_RUNNING);

  open_tablet_pool_->Wait();

  Status first_failure = Status::OK();

  SharedLock<RWMutex> shared_lock(mutex_);
  for (const auto& entry : tablet_map_) {
    const auto& peer = entry.second;
    // Keep only the first error; later failures do not overwrite it.
    if (peer->state() == tablet::FAILED && first_failure.ok()) {
      first_failure = peer->error();
    }
  }

  return first_failure;
}
626
627
// Marks `tablet_id` as being created, under a shared hold of the manager mutex.
//
// Returns AlreadyPresent if a tablet with this id is already registered, or the error from
// StartTabletStateTransition if the transition cannot be started. On success returns the
// deleter obtained from StartTabletStateTransition.
Result<scoped_refptr<TransitionInProgressDeleter>>
TSTabletManager::StartTabletStateTransitionForCreation(const TabletId& tablet_id) {
  scoped_refptr<TransitionInProgressDeleter> transition_deleter;
  SharedLock<RWMutex> lock(mutex_);
  TRACE("Acquired tablet manager lock");

  // Refuse to create a tablet that is already in the tablet map.
  TabletPeerPtr existing_peer;
  const bool already_registered = LookupTabletUnlocked(tablet_id, &existing_peer);
  if (already_registered) {
    return STATUS(AlreadyPresent, "Tablet already registered", tablet_id);
  }

  RETURN_NOT_OK(StartTabletStateTransition(tablet_id, "creating tablet", &transition_deleter));

  return transition_deleter;
}
643
644
// Creates a brand-new tablet on this server and schedules it to be opened.
//
// Steps: validate the manager state and Raft config, start the "creating tablet" transition,
// pick and register data/WAL directories, create and persist the tablet metadata and the
// consensus metadata, register a tablet peer, and submit an OpenTablet task to the open pool.
//
// `config` is taken by value because its opid_index is reset locally. This server must be a
// member of `config` and every peer must have a member type; both are enforced with CHECK.
//
// Returns the newly registered peer, or:
//   - IllegalState if the manager is not MANAGER_RUNNING;
//   - the error from StartTabletStateTransitionForCreation (e.g. AlreadyPresent);
//   - an error from metadata creation, consensus metadata creation, peer registration, or
//     task submission.
Result<TabletPeerPtr> TSTabletManager::CreateNewTablet(
    const tablet::TableInfoPtr& table_info,
    const string& tablet_id,
    const Partition& partition,
    RaftConfigPB config,
    const bool colocated,
    const std::vector<SnapshotScheduleId>& snapshot_schedules) {
  if (state() != MANAGER_RUNNING) {
    return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state());
  }
  CHECK(IsRaftConfigMember(server_->instance_pb().permanent_uuid(), config));

  for (const auto& config_peer : config.peers()) {
    CHECK(config_peer.has_member_type());
  }

  // A freshly created RaftConfigPB starts with opid_index -1.
  config.set_opid_index(consensus::kInvalidOpIdIndex);

  scoped_refptr<TransitionInProgressDeleter> deleter =
      VERIFY_RESULT(StartTabletStateTransitionForCreation(tablet_id));

  // Build the tablet metadata.
  TRACE("Creating new metadata...");
  string data_root_dir;
  string wal_root_dir;
  GetAndRegisterDataAndWalDir(
      fs_manager_, table_info->table_id, tablet_id, &data_root_dir, &wal_root_dir);
  auto metadata_result = RaftGroupMetadata::CreateNew(tablet::RaftGroupMetadataData {
    .fs_manager = fs_manager_,
    .table_info = table_info,
    .raft_group_id = tablet_id,
    .partition = partition,
    .tablet_data_state = TABLET_DATA_READY,
    .colocated = colocated,
    .snapshot_schedules = snapshot_schedules,
  }, data_root_dir, wal_root_dir);
  if (!metadata_result.ok()) {
    // Undo the directory registration done above before reporting the failure.
    UnregisterDataWalDir(table_info->table_id, tablet_id, data_root_dir, wal_root_dir);
  }
  RETURN_NOT_OK_PREPEND(metadata_result, "Couldn't create tablet metadata");
  RaftGroupMetadataPtr meta = std::move(*metadata_result);
  LOG(INFO) << TabletLogPrefix(tablet_id)
            << "Created tablet metadata for table: " << table_info->table_id;

  // The consensus metadata has to be on disk before the new tablet's TabletPeer and Consensus
  // implementation are started.
  std::unique_ptr<ConsensusMetadata> cmeta;
  RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager_, tablet_id, fs_manager_->uuid(),
                                                  config, consensus::kMinimumTerm, &cmeta),
                        "Unable to create new ConsensusMeta for tablet " + tablet_id);
  TabletPeerPtr new_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER));

  // Opening is handed to the tablet-open pool; this call returns once the task is submitted.
  RETURN_NOT_OK(
      open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)));

  return new_peer;
}
704
705
// Everything this file tracks about one subtablet while a tablet split is being applied.
// Instances are built by PrepareTabletCreationMetaDataForSplit() and then filled in further by
// StartSubtabletsSplit() and ApplyTabletSplit().
struct TabletCreationMetaData {
706
  // ID of the new tablet, taken from the split request (new_tablet1_id / new_tablet2_id).
  TabletId tablet_id;
707
  // Set by StartSubtabletsSplit() from StartTabletStateTransitionForCreation(); later passed to
  // CreatePeerAndOpenTablet().
  scoped_refptr<TransitionInProgressDeleter> transition_deleter;
708
  // Copy of the source tablet's partition with one end replaced by the split partition key.
  Partition partition;
709
  // Copy of the source tablet's key bounds with one bound reset to the split encoded key.
  docdb::KeyBounds key_bounds;
710
  // Set by ApplyTabletSplit() from the result of Tablet::CreateSubtablet().
  RaftGroupMetadataPtr raft_group_metadata;
711
};
712
713
namespace {
714
715
// Creates SplitTabletsCreationMetaData for two new tablets for `tablet` splitting based on request.
716
SplitTabletsCreationMetaData PrepareTabletCreationMetaDataForSplit(
717
44
    const tablet::SplitTabletRequestPB& request, const tablet::Tablet& tablet) {
718
44
  SplitTabletsCreationMetaData metas;
719
720
44
  const auto& split_partition_key = request.split_partition_key();
721
44
  const auto& split_encoded_key = request.split_encoded_key();
722
723
44
  std::shared_ptr<Partition> source_partition = tablet.metadata()->partition();
724
44
  const auto source_key_bounds = *tablet.doc_db().key_bounds;
725
726
44
  {
727
44
    TabletCreationMetaData meta;
728
44
    meta.tablet_id = request.new_tablet1_id();
729
44
    meta.partition = *source_partition;
730
44
    meta.key_bounds = source_key_bounds;
731
44
    meta.partition.set_partition_key_end(split_partition_key);
732
44
    meta.key_bounds.upper.Reset(split_encoded_key);
733
44
    metas.push_back(meta);
734
44
  }
735
736
44
  {
737
44
    TabletCreationMetaData meta;
738
44
    meta.tablet_id = request.new_tablet2_id();
739
44
    meta.partition = *source_partition;
740
44
    meta.key_bounds = source_key_bounds;
741
44
    meta.partition.set_partition_key_start(split_partition_key);
742
44
    meta.key_bounds.lower.Reset(split_encoded_key);
743
44
    metas.push_back(meta);
744
44
  }
745
746
44
  return metas;
747
44
}
748
749
}  // namespace
750
751
// Prepares every subtablet listed in `tcmetas` for creation:
// - registers a state transition for it and stores the deleter in the entry;
// - drops the entry from `tcmetas` when a transition is already registered for that tablet, or
//   when metadata already on disk says the tablet can serve data;
// - otherwise removes any on-disk data, WAL and consensus metadata left for that tablet ID.
// Returns the first non-OK status hit along the way.
Status TSTabletManager::StartSubtabletsSplit(
    const RaftGroupMetadata& source_tablet_meta, SplitTabletsCreationMetaData* tcmetas) {
  auto* const env = fs_manager_->env();

  for (auto it = tcmetas->begin(); it != tcmetas->end();) {
    const auto& subtablet_id = it->tablet_id;

    auto deleter_result = StartTabletStateTransitionForCreation(subtablet_id);
    if (!deleter_result.ok()) {
      if (deleter_result.status().IsAlreadyPresent()) {
        // State transition for sub tablet with subtablet_id could be already registered because
        // its remote bootstrap (from already split parent tablet leader) is in progress.
        it = tcmetas->erase(it);
        continue;
      }
      return deleter_result.status();
    }
    it->transition_deleter = *deleter_result;

    // Try to load metadata from previous not completed split.
    auto existing_meta = RaftGroupMetadata::Load(fs_manager_, subtablet_id);
    if (existing_meta.ok() && CanServeTabletData((*existing_meta)->tablet_data_state())) {
      // Sub tablet has been already created and ready during previous split attempt at this node
      // or as a result of remote bootstrap from another node, no need to re-create.
      it = tcmetas->erase(it);
      continue;
    }

    // Delete on-disk data for new tablet IDs in case it is present as a leftover from previously
    // failed tablet split attempt.
    // TODO(tsplit): add test for that.
    const auto subtablet_data_dir = source_tablet_meta.GetSubRaftGroupDataDir(subtablet_id);
    if (env->FileExists(subtablet_data_dir)) {
      RETURN_NOT_OK_PREPEND(
          env->DeleteRecursively(subtablet_data_dir),
          Format("Unable to recursively delete data dir for tablet $0", subtablet_id));
    }
    RETURN_NOT_OK(Log::DeleteOnDiskData(
        env, subtablet_id, source_tablet_meta.GetSubRaftGroupWalDir(subtablet_id),
        fs_manager_->uuid()));
    RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(fs_manager_, subtablet_id));

    ++it;
  }
  return Status::OK();
}
798
799
void TSTabletManager::CreatePeerAndOpenTablet(
800
    const tablet::RaftGroupMetadataPtr& meta,
801
86
    const scoped_refptr<TransitionInProgressDeleter>& deleter) {
802
86
  Status s = ResultToStatus(CreateAndRegisterTabletPeer(meta, NEW_PEER));
803
86
  if (!s.ok()) {
804
0
    s = s.CloneAndPrepend("Failed to create and register tablet peer");
805
0
    if (s.IsShutdownInProgress()) {
806
      // If shutdown is in progress, it is not a failure to not being able to create and register
807
      // tablet peer.
808
0
      LOG_WITH_PREFIX(WARNING) << s;
809
0
    } else {
810
0
      LOG_WITH_PREFIX(DFATAL) << s;
811
0
    }
812
0
    return;
813
0
  }
814
86
  s = open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter));
815
86
  if (!s.ok()) {
816
0
    LOG(DFATAL) << Format("Failed to schedule opening tablet $0: $1", meta->table_id(), s);
817
0
    return;
818
0
  }
819
86
}
820
821
47
// Applies a SPLIT_OP to the source tablet of `operation` on this server.
//
// Steps, in order:
//  1. Validate manager state and that the request targets this tablet and different new IDs.
//  2. Flush the Raft log index (`raft_log` is looked up from the tablet peer when null).
//  3. Build creation metadata for the subtablets and start their state transitions.
//  4. Register data/WAL dirs for the subtablets (unregistered again if this function fails).
//  5. For each subtablet: create its raft group metadata, flush consensus metadata under the new
//     tablet id, copy the log, mark the data state READY and flush.
//  6. Mark the split as done in the source metadata, flush it and notify the tablet.
//  7. Schedule CreatePeerAndOpenTablet for each subtablet on apply_pool_.
Status TSTabletManager::ApplyTabletSplit(tablet::SplitOperation* operation, log::Log* raft_log) {
  if (PREDICT_FALSE(FLAGS_TEST_crash_before_apply_tablet_split_op)) {
    LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_apply_tablet_split_op";
  }

  if (state() != MANAGER_RUNNING) {
    return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state());
  }

  auto* tablet = CHECK_NOTNULL(operation->tablet());
  const auto tablet_id = tablet->tablet_id();
  const auto* request = operation->request();
  SCHECK_EQ(
      request->tablet_id(), tablet_id, IllegalState,
      Format(
          "Unexpected SPLIT_OP $0 designated for tablet $1 to be applied to tablet $2",
          operation->op_id(), request->tablet_id(), tablet_id));
  SCHECK(
      tablet_id != request->new_tablet1_id() && tablet_id != request->new_tablet2_id(),
      IllegalState,
      Format(
          "One of SPLIT_OP $0 destination tablet IDs ($1, $2) is the same as source tablet ID $3",
          operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id(), tablet_id));

  LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation apply started";

  // Fall back to the log of the registered peer when the caller did not supply one.
  if (raft_log == nullptr) {
    auto source_peer = VERIFY_RESULT(LookupTablet(tablet_id));
    raft_log = source_peer->raft_consensus()->log().get();
  }

  MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_before_log_flushed);

  RETURN_NOT_OK(raft_log->FlushIndex());

  auto& source_meta = *CHECK_NOTNULL(tablet->metadata());

  // TODO(tsplit): We can later implement better per-disk distribution during compaction of split
  // tablets.
  // For now the subtablets are registered under the same root dirs as the source tablet.
  const auto table_id = source_meta.table_id();
  const auto data_root_dir =
      VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kData, table_id, tablet_id));
  const auto wal_root_dir =
      VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kWal, table_id, tablet_id));

  if (FLAGS_TEST_apply_tablet_split_inject_delay_ms > 0) {
    LOG(INFO) << "TEST: ApplyTabletSplit: injecting delay of "
              << FLAGS_TEST_apply_tablet_split_inject_delay_ms << " ms for "
              << AsString(*operation);
    std::this_thread::sleep_for(FLAGS_TEST_apply_tablet_split_inject_delay_ms * 1ms);
    LOG(INFO) << "TEST: ApplyTabletSplit: delay finished";
  }

  auto subtablets = PrepareTabletCreationMetaDataForSplit(*request, *tablet);

  // May remove entries from `subtablets`; only the remaining ones are created below.
  RETURN_NOT_OK(StartSubtabletsSplit(source_meta, &subtablets));

  for (const auto& subtablet : subtablets) {
    RegisterDataAndWalDir(fs_manager_, table_id, subtablet.tablet_id, data_root_dir, wal_root_dir);
  }

  // Undo the directory registration above on every exit path except the successful one.
  bool split_applied = false;
  auto unregister_dirs_on_failure = ScopeExit([&] {
    if (split_applied) {
      return;
    }
    for (const auto& subtablet : subtablets) {
      UnregisterDataWalDir(table_id, subtablet.tablet_id, data_root_dir, wal_root_dir);
    }
  });

  std::unique_ptr<ConsensusMetadata> consensus_meta;
  RETURN_NOT_OK(
      ConsensusMetadata::Load(fs_manager_, tablet_id, fs_manager_->uuid(), &consensus_meta));

  for (auto& subtablet : subtablets) {
    const auto& new_tablet_id = subtablet.tablet_id;

    // Copy raft group metadata.
    subtablet.raft_group_metadata = VERIFY_RESULT(tablet->CreateSubtablet(
        new_tablet_id, subtablet.partition, subtablet.key_bounds, operation->op_id(),
        operation->hybrid_time()));
    LOG_WITH_PREFIX(INFO) << "Created raft group metadata for table: " << table_id
                          << " tablet: " << new_tablet_id;

    // Copy consensus metadata.
    // Here we reuse the same cmeta instance for both new tablets. This is safe, because:
    // 1) Their consensus metadata only differ by tablet id.
    // 2) Flush() will save it into a new path corresponding to tablet id we set before flushing.
    consensus_meta->set_tablet_id(new_tablet_id);
    consensus_meta->set_split_parent_tablet_id(tablet_id);
    RETURN_NOT_OK(consensus_meta->Flush());

    const auto& subtablet_wal_dir = subtablet.raft_group_metadata->wal_dir();
    RETURN_NOT_OK(raft_log->CopyTo(subtablet_wal_dir));

    MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_after_log_copied);

    subtablet.raft_group_metadata->set_tablet_data_state(TABLET_DATA_READY);
    RETURN_NOT_OK(subtablet.raft_group_metadata->Flush());
  }

  source_meta.SetSplitDone(
      operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id());
  RETURN_NOT_OK(source_meta.Flush());

  tablet->SplitDone();

  for (auto& subtablet : subtablets) {
    // Call CreatePeerAndOpenTablet asynchronously to avoid write-locking TSTabletManager::mutex_
    // here since apply of SPLIT_OP is done under ReplicaState lock and this could lead to deadlock
    // in case of reverse lock order in some other thread.
    // See https://github.com/yugabyte/yugabyte-db/issues/4312 for more details.
    RETURN_NOT_OK(apply_pool_->SubmitFunc(std::bind(
        &TSTabletManager::CreatePeerAndOpenTablet, this, subtablet.raft_group_metadata,
        subtablet.transition_deleter)));
  }

  split_applied = true;
  LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation has been applied";
  return Status::OK();
}
940
941
217k
// Returns the log prefix "T <tablet_id> P <uuid>: " for the given tablet and peer uuid.
string LogPrefix(const string& tablet_id, const string& uuid) {
  string prefix("T ");
  prefix += tablet_id;
  prefix += " P ";
  prefix += uuid;
  prefix += ": ";
  return prefix;
}
944
945
// Returns OK when `leader_term` is at least `last_logged_term`.
// Otherwise logs a warning prefixed with LogPrefix(tablet_id, uuid) and returns InvalidArgument,
// rejecting the remote bootstrap request.
Status CheckLeaderTermNotLower(
    const string& tablet_id,
    const string& uuid,
    int64_t leader_term,
    int64_t last_logged_term) {
  const bool leader_term_too_low = leader_term < last_logged_term;
  if (PREDICT_FALSE(leader_term_too_low)) {
    Status rejection = STATUS(InvalidArgument,
        Substitute("Leader has replica of tablet $0 with term $1 lower than last "
                   "logged term $2 on local replica. Rejecting remote bootstrap request",
                   tablet_id, leader_term, last_logged_term));
    LOG(WARNING) << LogPrefix(tablet_id, uuid) << "Remote bootstrap: " << rejection;
    return rejection;
  }
  return Status::OK();
}
960
961
// Decides whether an existing local tablet (described by `meta` / `old_tablet_peer`) may be
// replaced by a remote bootstrap, based on its on-disk data state:
// - TABLET_DATA_COPYING: fatal, not expected while a transition is registered;
// - TABLET_DATA_TOMBSTONED: allowed if the old peer is shut down or not started and the leader's
//   term is not lower than the tombstone's last logged term;
// - TABLET_DATA_READY / TABLET_DATA_SPLIT_COMPLETED: IllegalState (fatal for the sys catalog);
// - anything else: IllegalState.
Status HandleReplacingStaleTablet(
    RaftGroupMetadataPtr meta,
    TabletPeerPtr old_tablet_peer,
    const string& tablet_id,
    const string& uuid,
    const int64_t& leader_term) {
  const TabletDataState current_state = meta->tablet_data_state();
  switch (current_state) {
    case TABLET_DATA_COPYING: {
      // This should not be possible due to the transition_in_progress_ "lock".
      LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: "
                 << "Found tablet in TABLET_DATA_COPYING state during StartRemoteBootstrap()";
    }
    case TABLET_DATA_TOMBSTONED: {
      RETURN_NOT_OK(old_tablet_peer->CheckShutdownOrNotStarted());
      const int64_t tombstone_term = meta->tombstone_last_logged_opid().term;
      return CheckLeaderTermNotLower(tablet_id, uuid, leader_term, tombstone_term);
    }
    case TABLET_DATA_SPLIT_COMPLETED:
    case TABLET_DATA_READY: {
      if (tablet_id == master::kSysCatalogTabletId) {
        LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: "
                   << "Found tablet in " << TabletDataState_Name(current_state)
                   << " state during StartRemoteBootstrap()";
      }
      // There's a valid race here that can lead us to come here:
      // 1. Leader sends a second remote bootstrap request as a result of receiving a
      // TABLET_NOT_FOUND from this tserver while it was in the middle of a remote bootstrap.
      // 2. The remote bootstrap request arrives after the first one is finished, and it is able to
      // grab the mutex.
      // 3. This tserver finds that it already has the metadata for the tablet, and determines that
      // it needs to replace the tablet setting replacing_tablet to true.
      // In this case, the master can simply ignore this error.
      return STATUS_FORMAT(
          IllegalState, "Tablet $0 in $1 state", tablet_id, TabletDataState_Name(current_state));
    }
    default: {
      return STATUS(IllegalState,
          Substitute("Found tablet $0 in unexpected state $1 for remote bootstrap.",
                     tablet_id, TabletDataState_Name(current_state)));
    }
  }
}
1010
1011
4.70k
// Remote-bootstraps the tablet named in `req` from the peer named in `req`, synchronously.
//
// Outline:
//  1. Reject requests without a source private address.
//  2. Count this bootstrap in num_tablets_being_remote_bootstrapped_ and record the source
//     address in bootstrap_source_addresses_ (both undone on exit).
//  3. Under mutex_: bail out if the manager is closing, look up an existing peer for the tablet
//     (which makes this a replacement), and register a state transition for the tablet.
//  4. If replacing, validate the existing tablet via HandleReplacingStaleTablet().
//  5. Start the remote bootstrap client, register a tablet peer, fetch all data and Finish().
//     Failures of FetchAll()/Finish() go through TOMBSTONE_NOT_OK.
//  6. Open the tablet; if the peer reports an error it is shut down and tombstoned.
//  7. Verify the role change (failure is only logged) and remove the bootstrap session.
//
// Returns OK on success, otherwise the status of the step that failed.
Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
  // The code below reads element 0 of source_private_addr unconditionally, so a request without
  // any private address must be rejected up front. Nothing has been modified at this point, so
  // returning here needs no cleanup.
  if (req.source_private_addr_size() == 0) {
    return STATUS(
        InvalidArgument, "Remote bootstrap request has no source private address",
        req.tablet_id());
  }

  // To prevent racing against Shutdown, we increment this as soon as we start. This should be done
  // before checking for ClosingUnlocked, as on shutdown, we proceed in reverse:
  // - first mark as closing
  // - then wait for num_tablets_being_remote_bootstrapped_ == 0
  ++num_tablets_being_remote_bootstrapped_;
  auto private_addr = req.source_private_addr()[0].host();
  // On every exit path: forget the source address recorded below (if present) and decrement the
  // in-flight counter.
  auto decrement_num_rbs_se = ScopeExit([this, &private_addr](){
    {
      std::lock_guard<RWMutex> lock(mutex_);
      auto iter = bootstrap_source_addresses_.find(private_addr);
      if (iter != bootstrap_source_addresses_.end()) {
        bootstrap_source_addresses_.erase(iter);
      }
    }
    --num_tablets_being_remote_bootstrapped_;
  });

  LongOperationTracker tracker("StartRemoteBootstrap", 5s);

  const string& tablet_id = req.tablet_id();
  const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid();
  HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort(
      req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(),
      server_->MakeCloudInfoPB()));
  int64_t leader_term = req.caller_term();

  const string kLogPrefix = TabletLogPrefix(tablet_id);

  TabletPeerPtr old_tablet_peer;
  RaftGroupMetadataPtr meta;
  bool replacing_tablet = false;
  scoped_refptr<TransitionInProgressDeleter> deleter;
  {
    std::lock_guard<RWMutex> lock(mutex_);
    bootstrap_source_addresses_.emplace(private_addr);
    if (ClosingUnlocked()) {
      auto result = STATUS_FORMAT(
          IllegalState, "StartRemoteBootstrap in wrong state: $0",
          TSTabletManagerStatePB_Name(state_));
      LOG(WARNING) << kLogPrefix << result;
      return result;
    }

    // An already registered peer for this tablet means we are replacing an existing replica.
    if (LookupTabletUnlocked(tablet_id, &old_tablet_peer)) {
      meta = old_tablet_peer->tablet_metadata();
      replacing_tablet = true;
    }
    RETURN_NOT_OK(StartTabletStateTransition(
        tablet_id, Substitute("remote bootstrapping tablet from peer $0", bootstrap_peer_uuid),
        &deleter));
  }

  if (replacing_tablet) {
    // Make sure the existing tablet peer is shut down and tombstoned.
    RETURN_NOT_OK(HandleReplacingStaleTablet(meta,
                                             old_tablet_peer,
                                             tablet_id,
                                             fs_manager_->uuid(),
                                             leader_term));
  }

  string init_msg = kLogPrefix + Substitute("Initiating remote bootstrap from Peer $0 ($1)",
                                            bootstrap_peer_uuid, bootstrap_peer_addr.ToString());
  LOG(INFO) << init_msg;
  TRACE(init_msg);

  auto rb_client = std::make_unique<RemoteBootstrapClient>(tablet_id, fs_manager_);

  // Download and persist the remote superblock in TABLET_DATA_COPYING state.
  if (replacing_tablet) {
    RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term));
  }
  RETURN_NOT_OK(rb_client->Start(bootstrap_peer_uuid,
                                 &server_->proxy_cache(),
                                 bootstrap_peer_addr,
                                 &meta,
                                 this));

  // From this point onward, the superblock is persisted in TABLET_DATA_COPYING
  // state, and we need to tombstone the tablet if additional steps prior to
  // getting to a TABLET_DATA_READY state fail.

  if (PREDICT_FALSE(FLAGS_TEST_simulate_already_present_in_remote_bootstrap)) {
    LOG_WITH_PREFIX(INFO)
        << "Simulating AlreadyPresent error in TSTabletManager::StartRemoteBootstrap.";
    return STATUS(AlreadyPresent, "failed");
  }

  // Registering a non-initialized TabletPeer offers visibility through the Web UI.
  TabletPeerPtr tablet_peer = VERIFY_RESULT(
        CreateAndRegisterTabletPeer(meta, replacing_tablet ? REPLACEMENT_PEER : NEW_PEER));
  MarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(),
      tablet_peer->tablet_metadata()->table_id());

  // TODO: If we ever make this method asynchronous, we need to move this code somewhere else.
  auto se = ScopeExit([this, tablet_peer] {
    UnmarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(),
        tablet_peer->tablet_metadata()->table_id());
  });

  // Download all of the remote files.
  TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer->status_listener()),
                   meta,
                   fs_manager_->uuid(),
                   "Remote bootstrap: Unable to fetch data from remote peer " +
                       bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")",
                   this);

  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_rb_files_fetched);

  // Write out the last files to make the new replica visible and update the
  // TabletDataState in the superblock to TABLET_DATA_READY.
  // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a
  // ChangeConfig request (to change this server's role from PRE_VOTER or PRE_OBSERVER to VOTER or
  // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could
  // have successfully submitted the ChangeConfig request and failed to respond in time)
  // and check the committed config until we find that this server's role has changed, or until we
  // time out which will cause us to tombstone the tablet.
  TOMBSTONE_NOT_OK(rb_client->Finish(),
                   meta,
                   fs_manager_->uuid(),
                   "Remote bootstrap: Failed calling Finish()",
                   this);

  LOG(INFO) << kLogPrefix << "Remote bootstrap: Opening tablet";

  // TODO(hector):  ENG-3173: We need to simulate a failure in OpenTablet during remote bootstrap
  // and verify that this tablet server gets remote bootstrapped again by the leader. We also need
  // to check what happens when this server receives raft consensus requests since at this point,
  // this tablet server could be a voter (if the ChangeRole request in Finish succeeded and its
  // initial role was PRE_VOTER).
  OpenTablet(meta, nullptr);
  // If OpenTablet fails, tablet_peer->error() will be set.
  RETURN_NOT_OK(ShutdownAndTombstoneTabletPeerNotOk(
      tablet_peer->error(), tablet_peer, meta, fs_manager_->uuid(),
      "Remote bootstrap: OpenTablet() failed", this));

  auto status = rb_client->VerifyChangeRoleSucceeded(tablet_peer->shared_consensus());
  if (!status.ok()) {
    // If for some reason this tserver wasn't promoted (e.g. from PRE-VOTER to VOTER), the leader
    // will find out and do the CHANGE_CONFIG.
    LOG(WARNING) << kLogPrefix << "Remote bootstrap finished. "
                               << "Failure calling VerifyChangeRoleSucceeded: "
                               << status.ToString();
  } else {
    LOG(INFO) << kLogPrefix << "Remote bootstrap for tablet ended successfully";
  }

  // Best effort: a failure to remove the session is only logged.
  WARN_NOT_OK(rb_client->Remove(), "Remove remote bootstrap sessions failed");

  return Status::OK();
}
1164
1165
// Create and register a new TabletPeer, given tablet metadata.
1166
Result<TabletPeerPtr> TSTabletManager::CreateAndRegisterTabletPeer(
1167
83.4k
    const RaftGroupMetadataPtr& meta, RegisterTabletPeerMode mode) {
1168
83.4k
  TabletPeerPtr tablet_peer(new tablet::TabletPeer(
1169
83.4k
      meta,
1170
83.4k
      local_peer_pb_,
1171
83.4k
      scoped_refptr<server::Clock>(server_->clock()),
1172
83.4k
      fs_manager_->uuid(),
1173
83.4k
      Bind(&TSTabletManager::ApplyChange, Unretained(this), meta->raft_group_id()),
1174
83.4k
      metric_registry_,
1175
83.4k
      this,
1176
83.4k
      async_client_init_->get_client_future()));
1177
83.4k
  RETURN_NOT_OK(RegisterTablet(meta->raft_group_id(), tablet_peer, mode));
1178
83.4k
  return tablet_peer;
1179
83.4k
}
1180
1181
// Deletes or tombstones the local replica of `tablet_id`.
//
// Parameters:
//   delete_type - must be TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED, else InvalidArgument.
//   cas_config_opid_index_less_or_equal - when set (and the tablet is neither already
//       deleted/tombstoned nor FAILED), the delete is refused with CAS_FAILED if the committed
//       config's opid_index is greater than this value.
//   hide_only - when true, only marks the metadata as hidden, flushes it and returns.
//   error_code - set on several failure paths (TABLET_NOT_FOUND, TABLET_NOT_RUNNING, CAS_FAILED).
//
// Otherwise the peer is shut down, its on-disk data is deleted, a DELETED tablet is removed from
// the tablet map, and its data/WAL dirs are unregistered.
Status TSTabletManager::DeleteTablet(
    const string& tablet_id,
    TabletDataState delete_type,
    const boost::optional<int64_t>& cas_config_opid_index_less_or_equal,
    bool hide_only,
    boost::optional<TabletServerErrorPB::Code>* error_code) {
  const bool deleted_or_tombstoned =
      delete_type == TABLET_DATA_DELETED || delete_type == TABLET_DATA_TOMBSTONED;
  if (!deleted_or_tombstoned) {
    return STATUS(InvalidArgument, "DeleteTablet() requires an argument that is one of "
                                   "TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED",
                                   Substitute("Given: $0 ($1)",
                                              TabletDataState_Name(delete_type), delete_type));
  }

  TRACE("Deleting tablet $0", tablet_id);

  TabletPeerPtr tablet_peer;
  scoped_refptr<TransitionInProgressDeleter> deleter;
  {
    // Acquire the lock in exclusive mode as we'll add a entry to the
    // transition_in_progress_ map.
    std::lock_guard<RWMutex> lock(mutex_);
    TRACE("Acquired tablet manager lock");
    RETURN_NOT_OK(CheckRunningUnlocked(error_code));

    if (!LookupTabletUnlocked(tablet_id, &tablet_peer)) {
      *error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
      return STATUS(NotFound, "Tablet not found", tablet_id);
    }
    // Sanity check that the tablet's deletion isn't already in progress
    Status transition_status =
        StartTabletStateTransition(tablet_id, "deleting tablet", &deleter);
    if (PREDICT_FALSE(!transition_status.ok())) {
      *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
      return transition_status;
    }
  }

  // If the tablet is already deleted, the CAS check isn't possible because
  // consensus and therefore the log is not available.
  const TabletDataState current_state = tablet_peer->tablet_metadata()->tablet_data_state();
  const bool already_removed =
      current_state == TABLET_DATA_DELETED || current_state == TABLET_DATA_TOMBSTONED;

  // If a tablet peer is in the FAILED state, then we need to be able to tombstone or delete this
  // tablet. If the tablet is tombstoned, then this TS can be remote bootstrapped with the same
  // tablet.
  const bool peer_failed = tablet_peer->state() == RaftGroupStatePB::FAILED;

  // They specified an "atomic" delete. Check the committed config's opid_index.
  // TODO: There's actually a race here between the check and shutdown, but
  // it's tricky to fix. We could try checking again after the shutdown and
  // restarting the tablet if the local replica committed a higher config
  // change op during that time, or potentially something else more invasive.
  if (cas_config_opid_index_less_or_equal && !already_removed && !peer_failed) {
    shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus();
    if (!consensus) {
      *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
      return STATUS(IllegalState, "Consensus not available. Tablet shutting down");
    }
    RaftConfigPB committed_config = consensus->CommittedConfig();
    if (committed_config.opid_index() > *cas_config_opid_index_less_or_equal) {
      *error_code = TabletServerErrorPB::CAS_FAILED;
      return STATUS(IllegalState, Substitute("Request specified cas_config_opid_index_less_or_equal"
                                             " of $0 but the committed config has opid_index of $1",
                                             *cas_config_opid_index_less_or_equal,
                                             committed_config.opid_index()));
    }
  }

  RaftGroupMetadataPtr meta = tablet_peer->tablet_metadata();
  if (hide_only) {
    meta->SetHidden(true);
    return meta->Flush();
  }

  // No matter if the tablet was deleted (drop table), or tombstoned (potentially moved to a
  // different TS), we do not need to flush rocksdb anymore, as this data is irrelevant.
  //
  // Note: This might change for PITR.
  RETURN_NOT_OK(tablet_peer->Shutdown(tablet::IsDropTable(deleted_or_tombstoned)));

  yb::OpId last_logged_opid = tablet_peer->GetLatestLogEntryOpId();

  Status delete_status =
      DeleteTabletData(meta, delete_type, fs_manager_->uuid(), last_logged_opid, this);
  if (PREDICT_FALSE(!delete_status.ok())) {
    delete_status = delete_status.CloneAndPrepend(
        Substitute("Unable to delete on-disk data from tablet $0", tablet_id));
    LOG(WARNING) << delete_status.ToString();
    tablet_peer->SetFailed(delete_status);
    return delete_status;
  }

  tablet_peer->status_listener()->StatusMessage("Deleted tablet blocks from disk");

  // We only remove DELETED tablets from the tablet map.
  if (delete_type == TABLET_DATA_DELETED) {
    std::lock_guard<RWMutex> lock(mutex_);
    RETURN_NOT_OK(CheckRunningUnlocked(error_code));
    CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id;
    dirty_tablets_.erase(tablet_id);
  }

  // We unregister TOMBSTONED tablets in addition to DELETED tablets because they do not have
  // any more data on disk, so we shouldn't count these tablets when load balancing the disks.
  UnregisterDataWalDir(meta->table_id(), tablet_id, meta->data_root_dir(), meta->wal_root_dir());

  return Status::OK();
}
1295
1296
// Returns OK when state_ is MANAGER_RUNNING. Otherwise sets *error_code to TABLET_NOT_RUNNING and
// returns ServiceUnavailable naming the current state. Reads state_ without taking a lock itself.
Status TSTabletManager::CheckRunningUnlocked(
    boost::optional<TabletServerErrorPB::Code>* error_code) const {
  if (state_ != MANAGER_RUNNING) {
    *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
    return STATUS(ServiceUnavailable, Substitute("Tablet Manager is not running: $0",
                                                 TSTabletManagerStatePB_Name(state_)));
  }
  return Status::OK();
}
1305
1306
// NO_THREAD_SAFETY_ANALYSIS because this analysis does not work with unique_lock.
1307
Status TSTabletManager::StartTabletStateTransition(
1308
    const string& tablet_id,
1309
    const string& reason,
1310
136k
    scoped_refptr<TransitionInProgressDeleter>* deleter) NO_THREAD_SAFETY_ANALYSIS {
1311
136k
  std::unique_lock<std::mutex> lock(transition_in_progress_mutex_);
1312
136k
  const auto emplace_result = transition_in_progress_.emplace(tablet_id, reason);
1313
136k
  if (!emplace_result.second) {
1314
4.89k
    return STATUS_FORMAT(
1315
4.89k
        AlreadyPresent, "State transition of tablet $0 already in progress: $1", tablet_id,
1316
4.89k
        *emplace_result.first);
1317
4.89k
  }
1318
131k
  deleter->reset(new TransitionInProgressDeleter(
1319
131k
      &transition_in_progress_, &transition_in_progress_mutex_, tablet_id));
1320
131k
  return Status::OK();
1321
131k
}
1322
1323
68
bool TSTabletManager::IsTabletInTransition(const TabletId& tablet_id) const {
1324
68
  std::unique_lock<std::mutex> lock(transition_in_progress_mutex_);
1325
68
  return ContainsKey(transition_in_progress_, tablet_id);
1326
68
}
1327
1328
// Loads the raft group metadata of `tablet_id` via RaftGroupMetadata::Load() and swaps it into
// `*metadata`. On failure returns the load error with the tablet id prepended and leaves
// `*metadata` untouched.
Status TSTabletManager::OpenTabletMeta(const string& tablet_id,
                                       RaftGroupMetadataPtr* metadata) {
  LOG(INFO) << "Loading metadata for tablet " << tablet_id;
  TRACE("Loading metadata...");

  auto loaded_meta = RaftGroupMetadata::Load(fs_manager_, tablet_id);
  RETURN_NOT_OK_PREPEND(
      loaded_meta, Format("Failed to load tablet metadata for tablet id $0", tablet_id));

  TRACE("Metadata loaded");
  metadata->swap(*loaded_meta);
  return Status::OK();
}
1339
1340
void TSTabletManager::OpenTablet(const RaftGroupMetadataPtr& meta,
1341
83.4k
                                 const scoped_refptr<TransitionInProgressDeleter>& deleter) {
1342
83.4k
  string tablet_id = meta->raft_group_id();
1343
83.4k
  TRACE_EVENT1("tserver", "TSTabletManager::OpenTablet",
1344
83.4k
               "tablet_id", tablet_id);
1345
1346
83.4k
  TabletPeerPtr tablet_peer;
1347
18.4E
  CHECK(LookupTablet(tablet_id, &tablet_peer))
1348
18.4E
      << "Tablet not registered prior to OpenTabletAsync call: " << tablet_id;
1349
1350
83.4k
  tablet::TabletPtr tablet;
1351
83.4k
  scoped_refptr<Log> log;
1352
83.4k
  const string kLogPrefix = TabletLogPrefix(tablet_id);
1353
1354
83.4k
  LOG(INFO) << kLogPrefix << "Bootstrapping tablet";
1355
83.4k
  TRACE("Bootstrapping tablet");
1356
1357
83.4k
  consensus::ConsensusBootstrapInfo bootstrap_info;
1358
83.4k
  consensus::RetryableRequests retryable_requests(kLogPrefix);
1359
83.4k
  yb::OpId split_op_id;
1360
1361
83.4k
  LOG_TIMING_PREFIX(INFO, kLogPrefix, "bootstrapping tablet") {
1362
    // Read flag before CAS to avoid TSAN race conflict with GetAllFlags.
1363
83.4k
    if (GetAtomicFlag(&FLAGS_TEST_force_single_tablet_failure) &&
1364
3
        CompareAndSetFlag(&FLAGS_TEST_force_single_tablet_failure,
1365
3
                          true /* expected */, false /* val */)) {
1366
3
      LOG(ERROR) << "Setting the state of a tablet to FAILED";
1367
3
      tablet_peer->SetFailed(STATUS(InternalError, "Setting tablet to failed state for test",
1368
3
                                    tablet_id));
1369
3
      return;
1370
3
    }
1371
1372
    // TODO: handle crash mid-creation of tablet? do we ever end up with a
1373
    // partially created tablet here?
1374
83.4k
    auto s = tablet_peer->SetBootstrapping();
1375
83.4k
    if (!s.ok()) {
1376
0
      LOG(ERROR) << kLogPrefix << "Tablet failed to set bootstrapping: " << s;
1377
0
      tablet_peer->SetFailed(s);
1378
0
      return;
1379
0
    }
1380
1381
83.4k
    tablet::TabletInitData tablet_init_data = {
1382
83.4k
      .metadata = meta,
1383
83.4k
      .client_future = async_client_init_->get_client_future(),
1384
83.4k
      .clock = scoped_refptr<server::Clock>(server_->clock()),
1385
83.4k
      .parent_mem_tracker = MemTracker::FindOrCreateTracker("Tablets", server_->mem_tracker()),
1386
83.4k
      .block_based_table_mem_tracker = mem_manager_->block_based_table_mem_tracker(),
1387
83.4k
      .metric_registry = metric_registry_,
1388
83.4k
      .log_anchor_registry = tablet_peer->log_anchor_registry(),
1389
83.4k
      .tablet_options = tablet_options_,
1390
83.4k
      .log_prefix_suffix = " P " + tablet_peer->permanent_uuid(),
1391
83.4k
      .transaction_participant_context = tablet_peer.get(),
1392
83.4k
      .local_tablet_filter = std::bind(&TSTabletManager::PreserveLocalLeadersOnly, this, _1),
1393
83.4k
      .transaction_coordinator_context = tablet_peer.get(),
1394
83.4k
      .txns_enabled = tablet::TransactionsEnabled::kTrue,
1395
      // We are assuming we're never dealing with the system catalog tablet in TSTabletManager.
1396
83.4k
      .is_sys_catalog = tablet::IsSysCatalogTablet::kFalse,
1397
83.4k
      .snapshot_coordinator = nullptr,
1398
83.4k
      .tablet_splitter = this,
1399
83.4k
      .allowed_history_cutoff_provider = std::bind(
1400
83.4k
          &TSTabletManager::AllowedHistoryCutoff, this, _1),
1401
83.4k
    };
1402
83.4k
    tablet::BootstrapTabletData data = {
1403
83.4k
      .tablet_init_data = tablet_init_data,
1404
83.4k
      .listener = tablet_peer->status_listener(),
1405
83.4k
      .append_pool = append_pool(),
1406
83.4k
      .allocation_pool = allocation_pool_.get(),
1407
83.4k
      .retryable_requests = &retryable_requests,
1408
83.4k
    };
1409
83.4k
    s = BootstrapTablet(data, &tablet, &log, &bootstrap_info);
1410
83.4k
    if (!s.ok()) {
1411
1
      LOG(ERROR) << kLogPrefix << "Tablet failed to bootstrap: " << s;
1412
1
      tablet_peer->SetFailed(s);
1413
1
      return;
1414
1
    }
1415
83.4k
  }
1416
1417
83.4k
  MonoTime start(MonoTime::Now());
1418
83.4k
  LOG_TIMING_PREFIX(INFO, kLogPrefix, "starting tablet") {
1419
83.4k
    TRACE("Initializing tablet peer");
1420
83.4k
    auto s = tablet_peer->InitTabletPeer(
1421
83.4k
        tablet,
1422
83.4k
        server_->mem_tracker(),
1423
83.4k
        server_->messenger(),
1424
83.4k
        &server_->proxy_cache(),
1425
83.4k
        log,
1426
83.4k
        tablet->GetTableMetricsEntity(),
1427
83.4k
        tablet->GetTabletMetricsEntity(),
1428
83.4k
        raft_pool(),
1429
83.4k
        tablet_prepare_pool(),
1430
83.4k
        &retryable_requests,
1431
83.4k
        multi_raft_manager_.get());
1432
1433
83.4k
    if (!s.ok()) {
1434
1
      LOG(ERROR) << kLogPrefix << "Tablet failed to init: "
1435
1
                 << s.ToString();
1436
1
      tablet_peer->SetFailed(s);
1437
1
      return;
1438
1
    }
1439
1440
83.4k
    TRACE("Starting tablet peer");
1441
83.4k
    s = tablet_peer->Start(bootstrap_info);
1442
83.4k
    if (!s.ok()) {
1443
2
      LOG(ERROR) << kLogPrefix << "Tablet failed to start: "
1444
2
                 << s.ToString();
1445
2
      tablet_peer->SetFailed(s);
1446
2
      return;
1447
2
    }
1448
1449
83.4k
    tablet_peer->RegisterMaintenanceOps(server_->maintenance_manager());
1450
83.4k
  }
1451
1452
83.4k
  auto elapsed_ms = MonoTime::Now().GetDeltaSince(start).ToMilliseconds();
1453
83.4k
  if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) {
1454
11
    LOG(WARNING) << kLogPrefix << "Tablet startup took " << elapsed_ms << "ms";
1455
11
    if (Trace::CurrentTrace()) {
1456
0
      LOG(WARNING) << kLogPrefix << "Trace:" << std::endl
1457
0
                   << Trace::CurrentTrace()->DumpToString(true);
1458
0
    }
1459
11
  }
1460
1461
83.4k
  if (PREDICT_TRUE(!FLAGS_TEST_skip_post_split_compaction)) {
1462
83.3k
    WARN_NOT_OK(
1463
83.3k
    tablet->TriggerPostSplitCompactionIfNeeded([&]() {
1464
83.3k
      return post_split_trigger_compaction_pool_->NewToken(ThreadPool::ExecutionMode::SERIAL);
1465
83.3k
    }),
1466
83.3k
    "Failed to submit compaction for post-split tablet.");
1467
30
  } else {
1468
30
    LOG(INFO) << "Skipping post split compaction " << meta->raft_group_id();
1469
30
  }
1470
1471
83.4k
  if (tablet->ShouldDisableLbMove()) {
1472
84
    std::lock_guard<RWMutex> lock(mutex_);
1473
84
    tablets_blocked_from_lb_.insert(tablet->tablet_id());
1474
0
    VLOG(2) << TabletLogPrefix(tablet->tablet_id())
1475
0
            << " marking as maybe being compacted after split.";
1476
84
  }
1477
83.4k
}
1478
1479
0
// Runs ForceFullRocksDBCompact() on every tablet in `tablets` via a concurrent token of the
// admin-triggered compaction pool and blocks until every submitted task has counted down.
//
// Returns the submission error if a task cannot be submitted; compaction failures themselves
// are only logged as warnings.
// NOTE(review): on a submission failure the function returns while earlier tasks still hold a
// reference to the stack-allocated latch -- confirm the token's destruction drains them first.
Status TSTabletManager::TriggerCompactionAndWait(const TabletPtrs& tablets) {
  CountDownLatch latch(tablets.size());
  auto token = admin_triggered_compaction_pool_->NewToken(ThreadPool::ExecutionMode::CONCURRENT);
  for (const auto& tablet : tablets) {
    // Each task keeps its own copy of the tablet pointer and signals the shared latch when done.
    auto compact_task = [&latch, tablet]() {
      WARN_NOT_OK(tablet->ForceFullRocksDBCompact(), "Failed to submit compaction for tablet.");
      latch.CountDown();
    };
    RETURN_NOT_OK(token->SubmitFunc(compact_task));
  }
  latch.Wait();
  return Status::OK();
}
1491
1492
76
void TSTabletManager::StartShutdown() {
1493
76
  {
1494
76
    std::lock_guard<RWMutex> lock(mutex_);
1495
76
    switch (state_) {
1496
0
      case MANAGER_QUIESCING: {
1497
0
        VLOG(1) << "Tablet manager shut down already in progress..";
1498
0
        return;
1499
0
      }
1500
0
      case MANAGER_SHUTDOWN: {
1501
0
        VLOG(1) << "Tablet manager has already been shut down.";
1502
0
        return;
1503
0
      }
1504
0
      case MANAGER_INITIALIZING:
1505
76
      case MANAGER_RUNNING: {
1506
76
        LOG_WITH_PREFIX(INFO) << "Shutting down tablet manager...";
1507
76
        state_ = MANAGER_QUIESCING;
1508
76
        break;
1509
0
      }
1510
0
      default: {
1511
0
        LOG(FATAL) << "Invalid state: " << TSTabletManagerStatePB_Name(state_);
1512
0
      }
1513
76
    }
1514
76
  }
1515
1516
76
  tablets_cleaner_->Shutdown();
1517
1518
76
  verify_tablet_data_poller_->Shutdown();
1519
1520
76
  metrics_cleaner_->Shutdown();
1521
1522
76
  async_client_init_->Shutdown();
1523
1524
76
  mem_manager_->Shutdown();
1525
1526
  // Wait for all RBS operations to finish.
1527
76
  const MonoDelta kSingleWait = 10ms;
1528
76
  const MonoDelta kReportInterval = 5s;
1529
76
  const MonoDelta kMaxWait = 30s;
1530
76
  MonoDelta waited = MonoDelta::kZero;
1531
76
  MonoDelta next_report_time = kReportInterval;
1532
76
  while (int remaining_rbs = num_tablets_being_remote_bootstrapped_ > 0) {
1533
0
    if (waited >= next_report_time) {
1534
0
      if (waited >= kMaxWait) {
1535
0
        std::string addr = "";
1536
0
        for (auto iter = bootstrap_source_addresses_.begin();
1537
0
             iter != bootstrap_source_addresses_.end();
1538
0
             iter++) {
1539
0
          if (iter == bootstrap_source_addresses_.begin()) {
1540
0
            addr += *iter;
1541
0
          } else {
1542
0
            addr += "," + *iter;
1543
0
          }
1544
0
        }
1545
0
        LOG_WITH_PREFIX(DFATAL)
1546
0
            << "Waited for " << waited << "ms. Still had "
1547
0
            << remaining_rbs << " pending remote bootstraps: " + addr;
1548
0
      } else {
1549
0
        LOG_WITH_PREFIX(WARNING)
1550
0
            << "Still waiting for " << remaining_rbs
1551
0
            << " ongoing RemoteBootstraps to finish after " << waited;
1552
0
      }
1553
0
      next_report_time = std::min(kMaxWait, waited + kReportInterval);
1554
0
    }
1555
0
    SleepFor(kSingleWait);
1556
0
    waited += kSingleWait;
1557
0
  }
1558
1559
  // Shut down the bootstrap pool, so new tablets are registered after this point.
1560
76
  open_tablet_pool_->Shutdown();
1561
1562
  // Take a snapshot of the peers list -- that way we don't have to hold
1563
  // on to the lock while shutting them down, which might cause a lock
1564
  // inversion. (see KUDU-308 for example).
1565
92
  for (const TabletPeerPtr& peer : GetTabletPeers()) {
1566
92
    if (peer->StartShutdown()) {
1567
91
      shutting_down_peers_.push_back(peer);
1568
91
    }
1569
92
  }
1570
76
}
1571
1572
73
void TSTabletManager::CompleteShutdown() {
1573
91
  for (const TabletPeerPtr& peer : shutting_down_peers_) {
1574
91
    peer->CompleteShutdown();
1575
91
  }
1576
1577
  // Shut down the apply pool.
1578
73
  apply_pool_->Shutdown();
1579
1580
73
  if (raft_pool_) {
1581
73
    raft_pool_->Shutdown();
1582
73
  }
1583
73
  if (tablet_prepare_pool_) {
1584
73
    tablet_prepare_pool_->Shutdown();
1585
73
  }
1586
73
  if (append_pool_) {
1587
73
    append_pool_->Shutdown();
1588
73
  }
1589
73
  if (post_split_trigger_compaction_pool_) {
1590
73
    post_split_trigger_compaction_pool_->Shutdown();
1591
73
  }
1592
73
  if (admin_triggered_compaction_pool_) {
1593
73
    admin_triggered_compaction_pool_->Shutdown();
1594
73
  }
1595
1596
73
  {
1597
73
    std::lock_guard<RWMutex> l(mutex_);
1598
73
    tablet_map_.clear();
1599
73
    dirty_tablets_.clear();
1600
1601
73
    std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
1602
73
    table_data_assignment_map_.clear();
1603
73
    table_wal_assignment_map_.clear();
1604
1605
73
    state_ = MANAGER_SHUTDOWN;
1606
73
  }
1607
73
}
1608
1609
106k
std::string TSTabletManager::LogPrefix() const {
1610
106k
  return "P " + fs_manager_->uuid() + ": ";
1611
106k
}
1612
1613
170k
std::string TSTabletManager::TabletLogPrefix(const TabletId& tablet_id) const {
1614
170k
  return tserver::LogPrefix(tablet_id, fs_manager_->uuid());
1615
170k
}
1616
1617
88.1k
bool TSTabletManager::ClosingUnlocked() const {
1618
88.1k
  return state_ == MANAGER_QUIESCING || state_ == MANAGER_SHUTDOWN;
1619
88.1k
}
1620
1621
// Adds `tablet_peer` to the tablet map under `tablet_id`, holding mutex_ exclusively.
//
// Returns:
//   - ShutdownInProgress if the manager is quiescing or shut down;
//   - NotFound if `mode` is REPLACEMENT_PEER but no previous peer was registered;
//   - AlreadyPresent if an entry for `tablet_id` already exists;
//   - OK otherwise.
// Every failure is also logged as a warning.
Status TSTabletManager::RegisterTablet(const TabletId& tablet_id,
                                       const TabletPeerPtr& tablet_peer,
                                       RegisterTabletPeerMode mode) {
  std::lock_guard<RWMutex> lock(mutex_);
  if (ClosingUnlocked()) {
    auto closing_status = STATUS_FORMAT(
        ShutdownInProgress, "Unable to register tablet peer: $0: closing", tablet_id);
    LOG(WARNING) << closing_status;
    return closing_status;
  }

  // If we are replacing a tablet peer, we delete the existing one first.
  if (mode == REPLACEMENT_PEER) {
    const auto num_erased = tablet_map_.erase(tablet_id);
    if (num_erased != 1) {
      auto missing_status = STATUS_FORMAT(
          NotFound, "Unable to remove previous tablet peer $0: not registered", tablet_id);
      LOG(WARNING) << missing_status;
      return missing_status;
    }
  }

  const bool inserted = InsertIfNotPresent(&tablet_map_, tablet_id, tablet_peer);
  if (!inserted) {
    auto duplicate_status = STATUS_FORMAT(
        AlreadyPresent, "Unable to register tablet peer $0: already registered", tablet_id);
    LOG(WARNING) << duplicate_status;
    return duplicate_status;
  }

  LOG_WITH_PREFIX(INFO) << "Registered tablet " << tablet_id;

  return Status::OK();
}
1650
1651
bool TSTabletManager::LookupTablet(const string& tablet_id,
1652
16.7M
                                   TabletPeerPtr* tablet_peer) const {
1653
16.7M
  SharedLock<RWMutex> shared_lock(mutex_);
1654
16.7M
  return LookupTabletUnlocked(tablet_id, tablet_peer);
1655
16.7M
}
1656
1657
// Result-returning flavour of LookupTablet(): yields the registered peer for `tablet_id`, or a
// NotFound status carrying the message "Tablet <id> not found".
Result<std::shared_ptr<tablet::TabletPeer>> TSTabletManager::LookupTablet(
    const TabletId& tablet_id) const {
  TabletPeerPtr peer;
  const bool found = LookupTablet(tablet_id, &peer);
  SCHECK(found, NotFound, Format("Tablet $0 not found", tablet_id));
  return peer;
}
1663
1664
bool TSTabletManager::LookupTabletUnlocked(const string& tablet_id,
1665
16.9M
                                           TabletPeerPtr* tablet_peer) const {
1666
16.9M
  const TabletPeerPtr* found = FindOrNull(tablet_map_, tablet_id);
1667
16.9M
  if (!found) {
1668
86.2k
    return false;
1669
86.2k
  }
1670
16.8M
  *tablet_peer = *found;
1671
16.8M
  return true;
1672
16.8M
}
1673
1674
// Fetches the peer for `tablet_id` and verifies that its data can be served.
//
// Returns NotFound if the tablet is not registered, IllegalState if the tablet's data state
// fails CanServeTabletData(), and OK otherwise. Note that `*tablet_peer` is already populated
// when IllegalState is returned.
Status TSTabletManager::GetTabletPeer(const string& tablet_id,
                                      TabletPeerPtr* tablet_peer) const {
  const bool registered = LookupTablet(tablet_id, tablet_peer);
  if (!registered) {
    return STATUS(NotFound, "Tablet not found", tablet_id);
  }

  const TabletDataState data_state = (*tablet_peer)->tablet_metadata()->tablet_data_state();
  if (CanServeTabletData(data_state)) {
    return Status::OK();
  }
  return STATUS(
      IllegalState, "Tablet data state not ready: " + TabletDataState_Name(data_state),
      tablet_id);
}
1687
1688
9.51M
// Returns the NodeInstancePB exposed by the owning server (server_->instance_pb()).
const NodeInstancePB& TSTabletManager::NodeInstance() const {
  const NodeInstancePB& instance = server_->instance_pb();
  return instance;
}
1691
1692
1
// Fills `*reg` by delegating to the owning server's GetRegistration(), passing
// server::RpcOnly::kTrue. Returns whatever status the server returns.
Status TSTabletManager::GetRegistration(ServerRegistrationPB* reg) const {
  const auto rpc_only = server::RpcOnly::kTrue;
  return server_->GetRegistration(reg, rpc_only);
}
1695
1696
42.7k
// Returns a snapshot of all non-null tablet peers, taken under a shared lock on mutex_.
//
// When `tablet_ptrs` is non-null, the non-null shared_tablet() of every returned peer is
// additionally appended to it.
TSTabletManager::TabletPeers TSTabletManager::GetTabletPeers(TabletPtrs* tablet_ptrs) const {
  SharedLock<RWMutex> shared_lock(mutex_);
  TabletPeers peers;
  GetTabletPeersUnlocked(&peers);
  if (tablet_ptrs != nullptr) {
    for (const auto& peer : peers) {
      if (peer) {
        auto tablet_ptr = peer->shared_tablet();
        if (tablet_ptr) {
          tablet_ptrs->push_back(tablet_ptr);
        }
      }
    }
  }
  return peers;
}
1711
1712
48.3k
// Appends every non-null peer stored in tablet_map_ to `*tablet_peers`.
// No lock is taken here. When the output vector starts out empty, capacity for the whole map is
// reserved up front.
void TSTabletManager::GetTabletPeersUnlocked(TabletPeers* tablet_peers) const {
  DCHECK(tablet_peers != nullptr);
  // See AppendKeysFromMap for why this is done.
  if (tablet_peers->empty()) {
    tablet_peers->reserve(tablet_map_.size());
  }
  for (const auto& id_and_peer : tablet_map_) {
    const auto& peer = id_and_peer.second;
    if (peer == nullptr) {
      continue;
    }
    tablet_peers->push_back(peer);
  }
}
1724
1725
243k
// Filters `*tablet_ids` in place, keeping only ids whose registered peer reports
// LeaderStatus::LEADER_AND_READY. Ids that are not in tablet_map_ are removed as well.
// The whole operation runs under a shared lock on mutex_.
void TSTabletManager::PreserveLocalLeadersOnly(std::vector<const TabletId*>* tablet_ids) const {
  SharedLock<decltype(mutex_)> shared_lock(mutex_);
  // Predicate for remove_if: true means "drop this id".
  auto should_drop = [this](const TabletId* id) REQUIRES_SHARED(mutex_) {
    const auto peer_it = tablet_map_.find(*id);
    if (peer_it == tablet_map_.end()) {
      return true;
    }
    const auto leader_status = peer_it->second->LeaderStatus();
    return leader_status != consensus::LeaderStatus::LEADER_AND_READY;
  };
  const auto first_dropped = std::remove_if(tablet_ids->begin(), tablet_ids->end(), should_drop);
  tablet_ids->erase(first_dropped, tablet_ids->end());
}
1738
1739
void TSTabletManager::ApplyChange(const string& tablet_id,
1740
338k
                                  shared_ptr<consensus::StateChangeContext> context) {
1741
338k
  WARN_NOT_OK(
1742
338k
      apply_pool_->SubmitFunc(
1743
338k
          std::bind(&TSTabletManager::MarkTabletDirty, this, tablet_id, context)),
1744
338k
      "Unable to run MarkDirty callback")
1745
338k
}
1746
1747
void TSTabletManager::MarkTabletDirty(const TabletId& tablet_id,
1748
338k
                                      std::shared_ptr<consensus::StateChangeContext> context) {
1749
338k
  std::lock_guard<RWMutex> lock(mutex_);
1750
338k
  MarkDirtyUnlocked(tablet_id, context);
1751
338k
}
1752
1753
void TSTabletManager::MarkTabletBeingRemoteBootstrapped(
1754
958
    const TabletId& tablet_id, const TableId& table_id) {
1755
958
  std::lock_guard<RWMutex> lock(mutex_);
1756
958
  tablets_being_remote_bootstrapped_.insert(tablet_id);
1757
958
  tablets_being_remote_bootstrapped_per_table_[table_id].insert(tablet_id);
1758
958
  MaybeDoChecksForTests(table_id);
1759
958
  LOG(INFO) << "Concurrent remote bootstrap sessions: "
1760
958
            << tablets_being_remote_bootstrapped_.size()
1761
958
            << "Concurrent remote bootstrap sessions for table " << table_id
1762
958
            << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size();
1763
958
}
1764
1765
void TSTabletManager::UnmarkTabletBeingRemoteBootstrapped(
1766
943
    const TabletId& tablet_id, const TableId& table_id) {
1767
943
  std::lock_guard<RWMutex> lock(mutex_);
1768
943
  tablets_being_remote_bootstrapped_.erase(tablet_id);
1769
943
  tablets_being_remote_bootstrapped_per_table_[table_id].erase(tablet_id);
1770
943
}
1771
1772
0
size_t TSTabletManager::TEST_GetNumDirtyTablets() const {
1773
0
  SharedLock<RWMutex> lock(mutex_);
1774
0
  return dirty_tablets_.size();
1775
0
}
1776
1777
// Fills `resp` with the number of tablets that have not finished bootstrapping and the total
// number of tablets considered.
//
// If the manager is not in MANAGER_RUNNING, both fields are set to INT_MAX. Otherwise tablets
// whose data state fails CanServeTabletData() are ignored; tablets in NOT_STARTED or
// BOOTSTRAPPING count as pending, and those plus RUNNING tablets make up the total.
// Always returns OK.
Status TSTabletManager::GetNumTabletsPendingBootstrap(
    IsTabletServerReadyResponsePB* resp) const {
  if (state() != MANAGER_RUNNING) {
    resp->set_num_tablets_not_running(INT_MAX);
    resp->set_total_tablets(INT_MAX);
    return Status::OK();
  }

  SharedLock<RWMutex> shared_lock(mutex_);
  int num_pending = 0;
  int total_tablets = 0;
  for (const auto& entry : tablet_map_) {
    const RaftGroupStatePB peer_state = entry.second->state();
    const TabletDataState data_state = entry.second->data_state();
    // Do not count tablets that will never get to RUNNING state.
    if (!CanServeTabletData(data_state)) {
      continue;
    }
    switch (peer_state) {
      case NOT_STARTED:
      case BOOTSTRAPPING:
        // Still on its way to RUNNING: counted both as existing and as pending.
        total_tablets++;
        num_pending++;
        break;
      case RUNNING:
        total_tablets++;
        break;
      default:
        break;
    }
  }

  LOG(INFO) << num_pending << " tablets pending bootstrap out of " << total_tablets;
  resp->set_num_tablets_not_running(num_pending);
  resp->set_total_tablets(total_tablets);

  return Status::OK();
}
1810
1811
403k
int TSTabletManager::GetNumLiveTablets() const {
1812
403k
  int count = 0;
1813
403k
  SharedLock<RWMutex> lock(mutex_);
1814
7.03M
  for (const auto& entry : tablet_map_) {
1815
7.03M
    RaftGroupStatePB state = entry.second->state();
1816
7.03M
    if (state == BOOTSTRAPPING ||
1817
6.79M
        state == RUNNING) {
1818
6.71M
      count++;
1819
6.71M
    }
1820
7.03M
  }
1821
403k
  return count;
1822
403k
}
1823
1824
403k
int TSTabletManager::GetLeaderCount() const {
1825
403k
  int count = 0;
1826
403k
  SharedLock<RWMutex> lock(mutex_);
1827
7.03M
  for (const auto& entry : tablet_map_) {
1828
7.03M
    consensus::LeaderStatus leader_status = entry.second->LeaderStatus(/* allow_stale =*/ true);
1829
7.03M
    if (leader_status != consensus::LeaderStatus::NOT_LEADER) {
1830
1.91M
      count++;
1831
1.91M
    }
1832
7.03M
  }
1833
403k
  return count;
1834
403k
}
1835
1836
void TSTabletManager::MarkDirtyUnlocked(const TabletId& tablet_id,
1837
338k
                                        std::shared_ptr<consensus::StateChangeContext> context) {
1838
338k
  TabletReportState* state = FindOrNull(dirty_tablets_, tablet_id);
1839
338k
  if (state != nullptr) {
1840
142k
    CHECK_GE(next_report_seq_, state->change_seq);
1841
142k
    state->change_seq = next_report_seq_;
1842
196k
  } else {
1843
196k
    TabletReportState state;
1844
196k
    state.change_seq = next_report_seq_;
1845
196k
    InsertOrDie(&dirty_tablets_, tablet_id, state);
1846
196k
  }
1847
0
  VLOG(2) << TabletLogPrefix(tablet_id)
1848
0
          << "Marking dirty. Reason: " << AsString(context)
1849
0
          << ". Will report this tablet to the Master in the next heartbeat "
1850
0
          << "as part of report #" << next_report_seq_;
1851
338k
  server_->heartbeater()->TriggerASAP();
1852
338k
}
1853
1854
5.81k
void TSTabletManager::InitLocalRaftPeerPB() {
1855
5.81k
  DCHECK_EQ(state(), MANAGER_INITIALIZING);
1856
5.81k
  local_peer_pb_.set_permanent_uuid(fs_manager_->uuid());
1857
5.81k
  ServerRegistrationPB reg;
1858
5.81k
  CHECK_OK(server_->GetRegistration(&reg, server::RpcOnly::kTrue));
1859
5.81k
  TakeRegistration(&reg, &local_peer_pb_);
1860
5.81k
}
1861
1862
void TSTabletManager::CreateReportedTabletPB(const TabletPeerPtr& tablet_peer,
1863
263k
                                             ReportedTabletPB* reported_tablet) {
1864
263k
  reported_tablet->set_tablet_id(tablet_peer->tablet_id());
1865
263k
  reported_tablet->set_state(tablet_peer->state());
1866
263k
  reported_tablet->set_tablet_data_state(tablet_peer->tablet_metadata()->tablet_data_state());
1867
263k
  if (tablet_peer->state() == tablet::FAILED) {
1868
0
    AppStatusPB* error_status = reported_tablet->mutable_error();
1869
0
    StatusToPB(tablet_peer->error(), error_status);
1870
0
  }
1871
263k
  reported_tablet->set_schema_version(tablet_peer->tablet_metadata()->schema_version());
1872
1873
263k
  {
1874
263k
    auto tablet_ptr = tablet_peer->shared_tablet();
1875
263k
    if (tablet_ptr != nullptr) {
1876
260k
      reported_tablet->set_should_disable_lb_move(tablet_ptr->ShouldDisableLbMove());
1877
260k
    }
1878
263k
  }
1879
263k
  reported_tablet->set_fs_data_dir(tablet_peer->tablet_metadata()->data_root_dir());
1880
1881
  // We cannot get consensus state information unless the TabletPeer is running.
1882
263k
  shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus();
1883
263k
  if (consensus) {
1884
260k
    *reported_tablet->mutable_committed_consensus_state() =
1885
260k
        consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED);
1886
260k
  }
1887
1888
  // Set the hide status of the tablet.
1889
263k
  reported_tablet->set_is_hidden(tablet_peer->tablet_metadata()->hidden());
1890
263k
}
1891
1892
398k
// Builds an incremental tablet report in `*report`.
//
// Under an exclusive lock on mutex_ the function:
//   1. assigns the report the next sequence number;
//   2. re-examines tablets_blocked_from_lb_, un-blocking (and marking dirty) tablets that no
//      longer ask for LB moves to be disabled and dropping ids that are no longer registered;
//   3. if `include_bootstrap` is set, marks every tablet being remote bootstrapped as dirty;
//   4. collects the peers of all dirty tablets; dirty ids without a registered peer are
//      reported as removed and erased from dirty_tablets_.
// The per-tablet protobufs are then filled in outside the lock, stopping once report_limit_
// entries have been added, and the number of dirty tablets left unreported is recorded.
void TSTabletManager::GenerateTabletReport(TabletReportPB* report, bool include_bootstrap) {
  report->Clear();
  // Creating the tablet report can be slow in the case that it is in the
  // middle of flushing its consensus metadata. We don't want to hold
  // mutex_ for too long, even in read mode, since it can cause other readers
  // to block if there is a waiting writer (see KUDU-2193). So, we just make
  // a local copy of the set of replicas.
  vector<std::shared_ptr<TabletPeer>> to_report;
  TabletIdSet tablet_ids;
  size_t dirty_count, report_limit;
  {
    std::lock_guard<RWMutex> write_lock(mutex_);
    uint32_t cur_report_seq = next_report_seq_++;
    report->set_sequence_number(cur_report_seq);

    // The iterator is advanced explicitly in every branch because entries may be erased.
    for (TabletIdSet::iterator blocked_it = tablets_blocked_from_lb_.begin();
         blocked_it != tablets_blocked_from_lb_.end();) {
      TabletPeerPtr* blocked_peer = FindOrNull(tablet_map_, *blocked_it);
      if (!blocked_peer) {
        VLOG(1) << "Tablet " << *blocked_it
                << " was marked as blocked from load balancing but was not found";
        blocked_it = tablets_blocked_from_lb_.erase(blocked_it);
        continue;
      }

      const auto tablet = (*blocked_peer)->shared_tablet();
      // If tablet is null, one of two things may be true:
      // 1. TabletPeer::InitTabletPeer was not called yet
      //
      // Skip and keep tablet in tablets_blocked_from_lb_ till call InitTabletPeer.
      //
      // 2. TabletPeer::CompleteShutdown was called
      //
      // Tablet will be removed from tablets_blocked_from_lb_ with next GenerateTabletReport
      // since tablet_peer will be removed from tablet_map_
      if (tablet == nullptr) {
        ++blocked_it;
        continue;
      }

      const std::string& tablet_id = tablet->tablet_id();
      if (tablet->ShouldDisableLbMove()) {
        // Still blocked; look at it again on the next report.
        ++blocked_it;
        continue;
      }
      blocked_it = tablets_blocked_from_lb_.erase(blocked_it);
      VLOG(1) << "Tablet " << tablet_id << " is no longer blocked from load-balancing.";
      InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq});
    }

    if (include_bootstrap) {
      for (auto const& bootstrapping_id : tablets_being_remote_bootstrapped_) {
        VLOG(1) << "Tablet " << bootstrapping_id
                << " being remote bootstrapped and marked for report";
        InsertOrUpdate(&dirty_tablets_, bootstrapping_id, TabletReportState{cur_report_seq});
      }
    }

    // Copy the dirty ids first: dirty_tablets_ may be modified while they are processed below.
    for (const DirtyMap::value_type& dirty_entry : dirty_tablets_) {
      tablet_ids.insert(dirty_entry.first);
    }

    for (auto const& dirty_id : tablet_ids) {
      TabletPeerPtr* dirty_peer = FindOrNull(tablet_map_, dirty_id);
      if (dirty_peer) {
        // Dirty entry, report on it.
        to_report.push_back(*dirty_peer);
        continue;
      }
      // Tell the Master that this tablet was removed from the TServer side.
      report->add_removed_tablet_ids(dirty_id);
      // Don't count this as a 'dirty_tablet_' because the Master may not have it either.
      dirty_tablets_.erase(dirty_id);
    }
    dirty_count = dirty_tablets_.size();
    report_limit = report_limit_;
  }

  for (const auto& replica : to_report) {
    CreateReportedTabletPB(replica, report->add_updated_tablets());
    // Enforce a max tablet limit on reported tablets.
    if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) break;
  }
  report->set_remaining_tablet_count(
      narrow_cast<int>(dirty_count - report->updated_tablets_size()));
}
1974
1975
5.59k
// Builds a full tablet report in `*report`.
//
// Under an exclusive lock on mutex_ the report receives the next sequence number and every
// registered peer is snapshotted and marked dirty with that sequence number. The per-tablet
// protobufs are then filled in outside the lock, stopping once report_limit_ entries have been
// added, and the number of dirty tablets left unreported is recorded.
void TSTabletManager::StartFullTabletReport(TabletReportPB* report) {
  report->Clear();
  // Creating the tablet report can be slow in the case that it is in the
  // middle of flushing its consensus metadata. We don't want to hold
  // mutex_ for too long, even in read mode, since it can cause other readers
  // to block if there is a waiting writer (see KUDU-2193). So, we just make
  // a local copy of the set of replicas.
  vector<std::shared_ptr<TabletPeer>> peers_snapshot;
  size_t dirty_count, report_limit;
  {
    std::lock_guard<RWMutex> write_lock(mutex_);
    uint32_t cur_report_seq = next_report_seq_++;
    report->set_sequence_number(cur_report_seq);
    GetTabletPeersUnlocked(&peers_snapshot);
    // Mark all tablets as dirty, to be cleaned when reading the heartbeat response.
    for (const auto& peer : peers_snapshot) {
      InsertOrUpdate(&dirty_tablets_, peer->tablet_id(), TabletReportState{cur_report_seq});
    }
    dirty_count = dirty_tablets_.size();
    report_limit = report_limit_;
  }

  for (const auto& replica : peers_snapshot) {
    CreateReportedTabletPB(replica, report->add_updated_tablets());
    // Enforce a max tablet limit on reported tablets.
    const auto num_reported = implicit_cast<size_t>(report->updated_tablets_size());
    if (num_reported >= report_limit) break;
  }
  report->set_remaining_tablet_count(
      narrow_cast<int32_t>(dirty_count - report->updated_tablets_size()));
}
2004
2005
void TSTabletManager::MarkTabletReportAcknowledged(uint32_t acked_seq,
2006
                                                   const TabletReportUpdatesPB& updates,
2007
396k
                                                   bool dirty_check) {
2008
396k
  std::lock_guard<RWMutex> l(mutex_);
2009
2010
396k
  CHECK_LT(acked_seq, next_report_seq_);
2011
2012
  // Clear the "dirty" state for any tablets processed in this report.
2013
260k
  for (auto const & tablet : updates.tablets()) {
2014
260k
    auto it = dirty_tablets_.find(tablet.tablet_id());
2015
260k
    if (it != dirty_tablets_.end()) {
2016
260k
      const TabletReportState& state = it->second;
2017
260k
      if (state.change_seq <= acked_seq) {
2018
        // This entry has not changed since this tablet report, we no longer need to track it
2019
        // as dirty. Next modification will be re-added with a higher sequence number.
2020
203k
        dirty_tablets_.erase(it);
2021
203k
      }
2022
260k
    }
2023
260k
  }
2024
396k
#ifndef NDEBUG
2025
  // Verify dirty_tablets_ always processes all tablet changes.
2026
396k
  if (dirty_check) {
2027
158k
    for (auto const & d : dirty_tablets_) {
2028
158k
      if (d.second.change_seq <= acked_seq) {
2029
0
        LOG(DFATAL) << "Dirty Tablet should have been reported but wasn't: "
2030
0
                    << d.first << "@" << d.second.change_seq << " <= " << acked_seq;
2031
0
      }
2032
158k
    }
2033
395k
  }
2034
396k
#endif
2035
396k
}
2036
2037
// Rolls forward the deletion of a tablet whose metadata is in one of the states DELETED,
// TOMBSTONED, COPYING or INIT_STARTED (any other state fails the CHECK below).
//
// COPYING is handled as TOMBSTONED and INIT_STARTED as DELETED. Unless the tablet is already
// tombstoned with neither RocksDB data nor on-disk WAL data, DeleteTabletData() is invoked.
// For DELETED the superblock is then removed; for TOMBSTONED a tablet peer is created and
// registered.
//
// Returns the first non-OK status produced by any of these steps.
Status TSTabletManager::HandleNonReadyTabletOnStartup(
    const RaftGroupMetadataPtr& meta) {
  const string& tablet_id = meta->raft_group_id();
  TabletDataState data_state = meta->tablet_data_state();
  CHECK(data_state == TABLET_DATA_DELETED ||
        data_state == TABLET_DATA_TOMBSTONED ||
        data_state == TABLET_DATA_COPYING ||
        data_state == TABLET_DATA_INIT_STARTED)
      << "Unexpected TabletDataState in tablet " << tablet_id << ": "
      << TabletDataState_Name(data_state) << " (" << data_state << ")";

  if (data_state == TABLET_DATA_COPYING) {
    // We tombstone tablets that failed to remotely bootstrap.
    data_state = TABLET_DATA_TOMBSTONED;
  } else if (data_state == TABLET_DATA_INIT_STARTED) {
    // We delete tablets that failed to completely initialize after a split.
    // TODO(tsplit): https://github.com/yugabyte/yugabyte-db/issues/8013
    data_state = TABLET_DATA_DELETED;
  }

  const string kLogPrefix = TabletLogPrefix(tablet_id);

  // Nothing has to be rolled forward when the tablet is already fully tombstoned with no
  // remaining data or WAL.
  const bool already_cleaned_up = meta->IsTombstonedWithNoRocksDBData() &&
                                  !Log::HasOnDiskData(meta->fs_manager(), meta->wal_dir());
  if (!already_cleaned_up) {
    LOG(WARNING)
        << kLogPrefix << "Tablet Manager startup: Rolling forward tablet deletion "
        << "of type " << TabletDataState_Name(data_state);
    // Passing no OpId will retain the last_logged_opid that was previously in the metadata.
    RETURN_NOT_OK(DeleteTabletData(meta, data_state, fs_manager_->uuid(), yb::OpId()));
  }

  // We only delete the actual superblock of a TABLET_DATA_DELETED tablet on startup.
  // TODO: Consider doing this after a fixed delay, instead of waiting for a restart.
  // See KUDU-941.
  if (data_state == TABLET_DATA_DELETED) {
    LOG(INFO) << kLogPrefix << "Deleting tablet superblock";
    return meta->DeleteSuperBlock();
  }

  // Register TOMBSTONED tablets so that they get reported to the Master, which
  // allows us to permanently delete replica tombstones when a table gets deleted.
  if (data_state == TABLET_DATA_TOMBSTONED) {
    RETURN_NOT_OK(CreateAndRegisterTabletPeer(meta, NEW_PEER));
  }

  return Status::OK();
}
2091
2092
void TSTabletManager::GetAndRegisterDataAndWalDir(FsManager* fs_manager,
2093
                                                  const string& table_id,
2094
                                                  const string& tablet_id,
2095
                                                  string* data_root_dir,
2096
83.4k
                                                  string* wal_root_dir) {
2097
  // Skip sys catalog table and kudu table from modifying the map.
2098
83.4k
  if (table_id == master::kSysCatalogTableId) {
2099
0
    return;
2100
0
  }
2101
83.4k
  LOG(INFO) << "Get and update data/wal directory assignment map for table: " \
2102
83.4k
            << table_id << " and tablet " << tablet_id;
2103
83.4k
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
2104
  // Initialize the map if the directory mapping does not exist.
2105
83.4k
  auto data_root_dirs = fs_manager->GetDataRootDirs();
2106
18.4E
  CHECK(!data_root_dirs.empty()) << "No data root directories found";
2107
83.4k
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2108
83.4k
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2109
11.2k
    for (string data_root_iter : data_root_dirs) {
2110
11.2k
      unordered_set<string> tablet_id_set;
2111
11.2k
      table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set;
2112
11.2k
    }
2113
11.1k
  }
2114
  // Find the data directory with the least count of tablets for this table.
2115
83.4k
  table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2116
83.4k
  auto data_assignment_value_map = table_data_assignment_iter->second;
2117
83.4k
  string min_dir;
2118
83.4k
  uint64_t min_dir_count = kuint64max;
2119
167k
  for (auto it = data_assignment_value_map.begin(); it != data_assignment_value_map.end(); ++it) {
2120
83.9k
    if (min_dir_count > it->second.size()) {
2121
83.6k
      min_dir = it->first;
2122
83.6k
      min_dir_count = it->second.size();
2123
83.6k
    }
2124
83.9k
  }
2125
83.4k
  *data_root_dir = min_dir;
2126
  // Increment the count for min_dir.
2127
83.4k
  auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(min_dir);
2128
83.4k
  data_assignment_value_iter->second.insert(tablet_id);
2129
2130
  // Find the wal directory with the least count of tablets for this table.
2131
83.4k
  min_dir = "";
2132
83.4k
  min_dir_count = kuint64max;
2133
83.4k
  auto wal_root_dirs = fs_manager->GetWalRootDirs();
2134
18.4E
  CHECK(!wal_root_dirs.empty()) << "No wal root directories found";
2135
83.4k
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2136
83.4k
  if (table_wal_assignment_iter == table_wal_assignment_map_.end()) {
2137
11.2k
    for (string wal_root_iter : wal_root_dirs) {
2138
11.2k
      unordered_set<string> tablet_id_set;
2139
11.2k
      table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set;
2140
11.2k
    }
2141
11.1k
  }
2142
83.4k
  table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2143
83.4k
  auto wal_assignment_value_map = table_wal_assignment_iter->second;
2144
167k
  for (auto it = wal_assignment_value_map.begin(); it != wal_assignment_value_map.end(); ++it) {
2145
83.9k
    if (min_dir_count > it->second.size()) {
2146
83.6k
      min_dir = it->first;
2147
83.6k
      min_dir_count = it->second.size();
2148
83.6k
    }
2149
83.9k
  }
2150
83.4k
  *wal_root_dir = min_dir;
2151
83.4k
  auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(min_dir);
2152
83.4k
  wal_assignment_value_iter->second.insert(tablet_id);
2153
83.4k
}
2154
2155
void TSTabletManager::RegisterDataAndWalDir(FsManager* fs_manager,
2156
                                            const string& table_id,
2157
                                            const string& tablet_id,
2158
                                            const string& data_root_dir,
2159
407
                                            const string& wal_root_dir) {
2160
  // Skip sys catalog table from modifying the map.
2161
407
  if (table_id == master::kSysCatalogTableId) {
2162
0
    return;
2163
0
  }
2164
407
  LOG(INFO) << "Update data/wal directory assignment map for table: "
2165
407
            << table_id << " and tablet " << tablet_id;
2166
407
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
2167
  // Initialize the map if the directory mapping does not exist.
2168
407
  auto data_root_dirs = fs_manager->GetDataRootDirs();
2169
0
  CHECK(!data_root_dirs.empty()) << "No data root directories found";
2170
407
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2171
407
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2172
119
    for (string data_root_iter : data_root_dirs) {
2173
119
      unordered_set<string> tablet_id_set;
2174
119
      table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set;
2175
119
    }
2176
82
  }
2177
  // Increment the count for data_root_dir.
2178
407
  table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2179
407
  auto data_assignment_value_map = table_data_assignment_iter->second;
2180
407
  auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir);
2181
407
  if (data_assignment_value_iter == table_data_assignment_map_[table_id].end()) {
2182
0
    unordered_set<string> tablet_id_set;
2183
0
    tablet_id_set.insert(tablet_id);
2184
0
    table_data_assignment_map_[table_id][data_root_dir] = tablet_id_set;
2185
407
  } else {
2186
407
    data_assignment_value_iter->second.insert(tablet_id);
2187
407
  }
2188
2189
407
  auto wal_root_dirs = fs_manager->GetWalRootDirs();
2190
0
  CHECK(!wal_root_dirs.empty()) << "No wal root directories found";
2191
407
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2192
407
  if (table_wal_assignment_iter == table_wal_assignment_map_.end()) {
2193
119
    for (string wal_root_iter : wal_root_dirs) {
2194
119
      unordered_set<string> tablet_id_set;
2195
119
      table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set;
2196
119
    }
2197
82
  }
2198
  // Increment the count for wal_root_dir.
2199
407
  table_wal_assignment_map_[table_id][wal_root_dir].insert(tablet_id);
2200
407
  table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2201
407
  auto wal_assignment_value_map = table_wal_assignment_iter->second;
2202
407
  auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir);
2203
407
  if (wal_assignment_value_iter == table_wal_assignment_map_[table_id].end()) {
2204
0
    unordered_set<string> tablet_id_set;
2205
0
    tablet_id_set.insert(tablet_id);
2206
0
    table_wal_assignment_map_[table_id][wal_root_dir] = tablet_id_set;
2207
407
  } else {
2208
407
    wal_assignment_value_iter->second.insert(tablet_id);
2209
407
  }
2210
407
}
2211
2212
// Returns a pointer to the assignment map matching 'dir_type': the data map for kData, the WAL
// map for kWal. Any other value ends in FATAL_INVALID_ENUM_VALUE.
//
// This function takes no lock itself.
TSTabletManager::TableDiskAssignmentMap* TSTabletManager::GetTableDiskAssignmentMapUnlocked(
    TabletDirType dir_type) {
  TableDiskAssignmentMap* assignment_map = nullptr;
  // No default label, so the compiler can still flag enumerators that are not handled.
  switch (dir_type) {
    case TabletDirType::kData:
      assignment_map = &table_data_assignment_map_;
      break;
    case TabletDirType::kWal:
      assignment_map = &table_wal_assignment_map_;
      break;
  }
  if (assignment_map == nullptr) {
    FATAL_INVALID_ENUM_VALUE(TabletDirType, dir_type);
  }
  return assignment_map;
}
2222
2223
// Looks up the root directory of kind 'dir_type' under which 'tablet_id' of table 'table_id' is
// registered.
//
// Returns IllegalState when the table has no entry in the selected assignment map, or when no
// directory of that table contains the tablet.
//
// NOTE(review): the returned reference points at a key stored inside the assignment map, while
// dir_assignment_mutex_ is only held for the duration of this call - confirm callers do not
// keep the reference across modifications of the map.
Result<const std::string&> TSTabletManager::GetAssignedRootDirForTablet(
    TabletDirType dir_type, const TableId& table_id, const TabletId& tablet_id) {
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);

  TableDiskAssignmentMap& assignment_map = *GetTableDiskAssignmentMapUnlocked(dir_type);
  const auto table_entry = assignment_map.find(table_id);
  if (table_entry == assignment_map.end()) {
    return STATUS_FORMAT(
        IllegalState, "Table ID $0 is not in $1 table assignment map", table_id, dir_type);
  }

  auto& tablets_by_dir = table_entry->second;
  for (auto dir_it = tablets_by_dir.begin(); dir_it != tablets_by_dir.end(); ++dir_it) {
    const auto& tablets_in_dir = dir_it->second;
    if (tablets_in_dir.find(tablet_id) != tablets_in_dir.end()) {
      return dir_it->first;
    }
  }

  return STATUS_FORMAT(
      IllegalState, "Tablet ID $0 is not found in $1 assignment map for table $2", tablet_id,
      dir_type, table_id);
}
2242
2243
void TSTabletManager::UnregisterDataWalDir(const string& table_id,
2244
                                           const string& tablet_id,
2245
                                           const string& data_root_dir,
2246
48.0k
                                           const string& wal_root_dir) {
2247
  // Skip sys catalog table from modifying the map.
2248
48.0k
  if (table_id == master::kSysCatalogTableId) {
2249
0
    return;
2250
0
  }
2251
48.0k
  LOG(INFO) << "Unregister data/wal directory assignment map for table: "
2252
48.0k
            << table_id << " and tablet " << tablet_id;
2253
48.0k
  std::lock_guard<std::mutex> lock(dir_assignment_mutex_);
2254
48.0k
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2255
48.0k
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2256
    // It is possible that we can't find an assignment for the table if the operations followed in
2257
    // this order:
2258
    // 1. The only tablet for a table gets tombstoned, and UnregisterDataWalDir removes it from
2259
    //    the maps.
2260
    // 2. TSTabletManager gets restarted (so the maps are cleared).
2261
    // 3. During TsTabletManager initialization, the tombstoned TABLET won't get registered,
2262
    //    so if a DeleteTablet request with type DELETED gets sent, UnregisterDataWalDir won't
2263
    //    find the table.
2264
2265
    // Check that both maps should be consistent.
2266
0
    DCHECK(table_wal_assignment_map_.find(table_id) == table_wal_assignment_map_.end());
2267
0
  }
2268
48.0k
  if (table_data_assignment_iter != table_data_assignment_map_.end()) {
2269
48.0k
    auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir);
2270
0
    DCHECK(data_assignment_value_iter != table_data_assignment_map_[table_id].end())
2271
0
      << "No data directory index found for table: " << table_id;
2272
48.0k
    if (data_assignment_value_iter != table_data_assignment_map_[table_id].end()) {
2273
48.0k
      data_assignment_value_iter->second.erase(tablet_id);
2274
0
    } else {
2275
0
      LOG(WARNING) << "Tablet " << tablet_id << " not in the set for data directory "
2276
0
                   << data_root_dir << "for table " << table_id;
2277
0
    }
2278
48.0k
  }
2279
48.0k
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2280
48.0k
  if (table_wal_assignment_iter != table_wal_assignment_map_.end()) {
2281
48.0k
    auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir);
2282
0
    DCHECK(wal_assignment_value_iter != table_wal_assignment_map_[table_id].end())
2283
0
      << "No wal directory index found for table: " << table_id;
2284
48.0k
    if (wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) {
2285
48.0k
      wal_assignment_value_iter->second.erase(tablet_id);
2286
0
    } else {
2287
0
      LOG(WARNING) << "Tablet " << tablet_id << " not in the set for wal directory "
2288
0
                   << wal_root_dir << "for table " << table_id;
2289
0
    }
2290
48.0k
  }
2291
48.0k
}
2292
2293
8
// Returns a reference to the client object exposed by async_client_init_.
client::YBClient& TSTabletManager::client() {
  auto yb_client = async_client_init_->client();
  return *yb_client;
}
2296
2297
51.7k
const std::shared_future<client::YBClient*>& TSTabletManager::client_future() {
2298
51.7k
  return async_client_init_->get_client_future();
2299
51.7k
}
2300
2301
958
void TSTabletManager::MaybeDoChecksForTests(const TableId& table_id) {
2302
  // First check that the global RBS limits are respected if the flag is non-zero.
2303
958
  if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than > 0) &&
2304
24
      tablets_being_remote_bootstrapped_.size() >
2305
0
          FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) {
2306
0
    string tablets;
2307
    // The purpose of limiting the number of remote bootstraps is to cap how much
2308
    // network bandwidth all the RBS sessions use.
2309
    // When we finish transferring the files, we wait until the role of the new peer
2310
    // has been changed from PRE_VOTER to VOTER before we remove the tablet_id
2311
    // from tablets_being_remote_bootstrapped_. Since it's possible to be here
2312
    // because a few tablets are already open, and in the RUNNING state, but still
2313
    // in the tablets_being_remote_bootstrapped_ list, we check the state of each
2314
    // tablet before deciding if the load balancer has violated the concurrent RBS limit.
2315
0
    size_t count = 0;
2316
0
    for (const auto& tablet_id : tablets_being_remote_bootstrapped_) {
2317
0
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id);
2318
0
      if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) {
2319
0
        continue;
2320
0
      }
2321
0
      if (!tablets.empty()) {
2322
0
        tablets += ", ";
2323
0
      }
2324
0
      tablets += tablet_id;
2325
0
      count++;
2326
0
    }
2327
0
    if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) {
2328
0
      LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap sessions."
2329
0
                 << " Specified: " << FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than
2330
0
                 << ", number concurrent remote bootstrap sessions: "
2331
0
                 << tablets_being_remote_bootstrapped_.size() << ", for tablets: " << tablets;
2332
0
    }
2333
0
  }
2334
2335
  // Check that the per-table RBS limits are respected if the flag is non-zero.
2336
958
  if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than > 0) &&
2337
24
      tablets_being_remote_bootstrapped_per_table_[table_id].size() >
2338
0
          FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) {
2339
0
    string tablets;
2340
0
    size_t count = 0;
2341
0
    for (const auto& tablet_id : tablets_being_remote_bootstrapped_per_table_[table_id]) {
2342
0
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id);
2343
0
      if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) {
2344
0
        continue;
2345
0
      }
2346
0
      if (!tablets.empty()) {
2347
0
        tablets += ", ";
2348
0
      }
2349
0
      tablets += tablet_id;
2350
0
      count++;
2351
0
    }
2352
0
    if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) {
2353
0
      LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap "
2354
0
                 << "sessions per table. Specified: "
2355
0
                 << FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than
2356
0
                 << ", number of concurrent remote bootstrap sessions for table " << table_id
2357
0
                 << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size()
2358
0
                 << ", for tablets: " << tablets;
2359
0
    }
2360
0
  }
2361
958
}
2362
2363
396k
// Applies the snapshot information contained in 'info'.
//
// Under snapshot_schedule_allowed_history_cutoff_mutex_: bumps snapshot_schedules_version_,
// rebuilds the schedule -> last snapshot hybrid time map from 'info.schedules()' and drops the
// listed schedules from missing_snapshot_schedules_. If the restorations update time in 'info'
// differs from the last one seen, it is stored and the restoration completion times are
// decoded.
//
// When the restorations changed, CheckRestorations() is then called on every tablet of
// tablet_map_ that currently has a tablet object; the tablets are collected under a shared lock
// on mutex_ and checked after it is released.
//
// Returns the first decoding or CheckRestorations() error, OK otherwise.
Status TSTabletManager::UpdateSnapshotsInfo(const master::TSSnapshotsInfoPB& info) {
  RestorationCompleteTimeMap restoration_complete_time;
  bool restorations_updated = false;
  {
    std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_);
    ++snapshot_schedules_version_;
    snapshot_schedule_allowed_history_cutoff_.clear();
    for (const auto& schedule : info.schedules()) {
      auto schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id()));
      const auto last_snapshot_ht = HybridTime::FromPB(schedule.last_snapshot_hybrid_time());
      snapshot_schedule_allowed_history_cutoff_.emplace(schedule_id, last_snapshot_ht);
      missing_snapshot_schedules_.erase(schedule_id);
    }

    HybridTime reported_update_ht(info.last_restorations_update_ht());
    restorations_updated = reported_update_ht != last_restorations_update_ht_;
    if (restorations_updated) {
      last_restorations_update_ht_ = reported_update_ht;
      for (const auto& restoration : info.restorations()) {
        auto restoration_id = VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(restoration.id()));
        restoration_complete_time.emplace(
            restoration_id, HybridTime::FromPB(restoration.complete_time_ht()));
      }
    }
  }

  if (!restorations_updated) {
    return Status::OK();
  }

  // Snapshot the list of live tablets so CheckRestorations() runs without holding mutex_.
  std::vector<tablet::TabletPtr> live_tablets;
  {
    SharedLock<RWMutex> shared_lock(mutex_);
    live_tablets.reserve(tablet_map_.size());
    for (const auto& id_and_peer : tablet_map_) {
      auto tablet = id_and_peer.second->shared_tablet();
      if (!tablet) {
        continue;
      }
      live_tablets.push_back(tablet);
    }
  }

  for (const auto& tablet : live_tablets) {
    RETURN_NOT_OK(tablet->CheckRestorations(restoration_complete_time));
  }
  return Status::OK();
}
2406
2407
412
// Computes the history cutoff allowed for the tablet described by 'metadata', based on the
// snapshot schedules it is covered by.
//
// Returns:
//  - HybridTime::kMax when the metadata lists no snapshot schedules;
//  - HybridTime::kMin when a schedule is unknown (and not yet considered deleted) or when a
//    known schedule has no snapshot hybrid time yet;
//  - otherwise the minimum last snapshot hybrid time over the known schedules.
//
// Schedules considered deleted are removed from the metadata on exit, and the metadata is
// flushed if anything was removed (a flush failure is only logged).
HybridTime TSTabletManager::AllowedHistoryCutoff(tablet::RaftGroupMetadata* metadata) {
  auto schedules = metadata->SnapshotSchedules();
  if (schedules.empty()) {
    return HybridTime::kMax;
  }

  std::vector<SnapshotScheduleId> schedules_to_remove;
  // Declared before the lock below, so this cleanup runs after the lock has been released.
  auto se = ScopeExit([&schedules_to_remove, metadata]() {
    bool any_removed = false;
    for (const auto& schedule_id : schedules_to_remove) {
      if (metadata->RemoveSnapshotSchedule(schedule_id)) {
        any_removed = true;
      }
    }
    if (any_removed) {
      WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata");
    }
  });

  std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_);
  HybridTime cutoff = HybridTime::kMax;
  for (const auto& schedule_id : schedules) {
    auto known_it = snapshot_schedule_allowed_history_cutoff_.find(schedule_id);
    if (known_it != snapshot_schedule_allowed_history_cutoff_.end()) {
      if (!known_it->second) {
        // Schedules does not have snapshots yet.
        return HybridTime::kMin;
      }
      cutoff = std::min(cutoff, known_it->second);
      continue;
    }

    // We don't know this schedule.
    auto emplace_result = missing_snapshot_schedules_.emplace(
        schedule_id, snapshot_schedules_version_);
    const bool seen_missing_before = !emplace_result.second;
    if (seen_missing_before &&
        emplace_result.first->second + 2 <= snapshot_schedules_version_) {
      // We don't know this schedule, and there are already 2 rounds of heartbeat passed
      // after we first time found that we don't know this schedule.
      // So it means that schedule was deleted.
      // One round is not enough, because schedule could be added after heartbeat processed on
      // master, but response not yet received on TServer.
      schedules_to_remove.push_back(schedule_id);
      continue;
    }
    return HybridTime::kMin;
  }
  return cutoff;
}
2453
2454
// Deletes the on-disk data of the tablet described by 'meta'.
//
// 'data_state' must be TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED (CHECK-enforced). The
// tablet data is deleted through the metadata, then the WAL on-disk data. For any state other
// than TOMBSTONED (i.e. DELETED) the consensus metadata is deleted as well; the superblock and
// the consensus metadata are kept when tombstoning.
//
// 'uuid' is only used to build the log prefix. 'ts_manager' is not used by this function body.
//
// Returns the first non-OK status of the deletion steps.
Status DeleteTabletData(const RaftGroupMetadataPtr& meta,
                        TabletDataState data_state,
                        const string& uuid,
                        const yb::OpId& last_logged_opid,
                        TSTabletManager* ts_manager) {
  const string& tablet_id = meta->raft_group_id();
  const string kLogPrefix = LogPrefix(tablet_id, uuid);
  LOG(INFO) << kLogPrefix << "Deleting tablet data with delete state "
            << TabletDataState_Name(data_state);
  const bool is_supported_state =
      data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED;
  CHECK(is_supported_state)
      << "Unexpected data_state to delete tablet " << meta->raft_group_id() << ": "
      << TabletDataState_Name(data_state) << " (" << data_state << ")";

  // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
  // that was previously in the metadata.
  RETURN_NOT_OK(meta->DeleteTabletData(data_state, last_logged_opid));
  LOG(INFO) << kLogPrefix << "Tablet deleted. Last logged OpId: "
            << meta->tombstone_last_logged_opid();
  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_blocks_deleted);

  auto* fs_manager = meta->fs_manager();
  RETURN_NOT_OK(Log::DeleteOnDiskData(
      fs_manager->env(), meta->raft_group_id(), meta->wal_dir(), fs_manager->uuid()));
  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_wal_deleted);

  // We do not delete the superblock or the consensus metadata when tombstoning
  // a tablet, so only TABLET_DATA_DELETED tablets enter this branch.
  if (data_state != TABLET_DATA_TOMBSTONED) {
    RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(meta->fs_manager(), meta->raft_group_id()));
    MAYBE_FAULT(FLAGS_TEST_fault_crash_after_cmeta_deleted);
  }

  return Status::OK();
}
2492
2493
void LogAndTombstone(const RaftGroupMetadataPtr& meta,
2494
                     const std::string& msg,
2495
                     const std::string& uuid,
2496
                     const Status& s,
2497
3
                     TSTabletManager* ts_manager) {
2498
3
  const string& tablet_id = meta->raft_group_id();
2499
3
  const string kLogPrefix = LogPrefix(tablet_id, uuid);
2500
3
  LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString();
2501
2502
  // Tombstone the tablet when remote bootstrap fails.
2503
3
  LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed remote bootstrap";
2504
3
  Status delete_status = DeleteTabletData(meta,
2505
3
                                          TABLET_DATA_TOMBSTONED,
2506
3
                                          uuid,
2507
3
                                          yb::OpId(),
2508
3
                                          ts_manager);
2509
2510
3
  if (PREDICT_FALSE(FLAGS_TEST_sleep_after_tombstoning_tablet_secs > 0)) {
2511
    // We sleep here so that the test can verify that the state of the tablet is
2512
    // TABLET_DATA_TOMBSTONED.
2513
0
    LOG(INFO) << "Sleeping after remote bootstrap failed";
2514
0
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_sleep_after_tombstoning_tablet_secs));
2515
0
  }
2516
2517
3
  if (PREDICT_FALSE(!delete_status.ok())) {
2518
    // This failure should only either indicate a bug or an IO error.
2519
0
    LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after remote bootstrap: "
2520
0
               << delete_status.ToString();
2521
0
  }
2522
2523
  // Remove the child tracker if present.
2524
3
  if (ts_manager != nullptr) {
2525
2
    auto tracker = MemTracker::FindTracker(
2526
2
        Format("tablet-$0", meta->raft_group_id()), ts_manager->server()->mem_tracker());
2527
2
    if (tracker) {
2528
0
      tracker->UnregisterFromParent();
2529
0
    }
2530
2
  }
2531
3
}
2532
2533
// Stores the transition map, the mutex to lock when accessing it, and the id of the tablet
// whose entry the destructor removes from that map.
TransitionInProgressDeleter::TransitionInProgressDeleter(
    TransitionInProgressMap* map, std::mutex* mutex, const TabletId& tablet_id)
    : in_progress_(map),
      mutex_(mutex),
      tablet_id_(tablet_id) {
}
2536
2537
131k
// Removes this tablet's entry from the transition map while holding the mutex, then logs the
// removed transition after the mutex has been released. The entry must exist (CHECK).
TransitionInProgressDeleter::~TransitionInProgressDeleter() {
  std::string removed_transition;
  {
    std::lock_guard<std::mutex> guard(*mutex_);
    const auto entry = in_progress_->find(tablet_id_);
    CHECK(entry != in_progress_->end());
    removed_transition = entry->second;
    in_progress_->erase(entry);
  }
  LOG(INFO) << "Deleted transition in progress " << removed_transition
            << " for tablet " << tablet_id_;
}
2549
2550
// Returns 'status' unchanged. When it is not OK, the tablet peer (if any) is shut down first
// and the tablet is tombstoned through LogAndTombstone(), with 'msg' as the log message.
Status ShutdownAndTombstoneTabletPeerNotOk(
    const Status& status, const tablet::TabletPeerPtr& tablet_peer,
    const tablet::RaftGroupMetadataPtr& meta, const std::string& uuid, const char* msg,
    TSTabletManager* ts_tablet_manager) {
  if (!status.ok()) {
    // If shutdown was initiated by someone else we should not wait for shutdown to complete.
    const bool shutdown_started_here = tablet_peer && tablet_peer->StartShutdown();
    if (shutdown_started_here) {
      tablet_peer->CompleteShutdown();
    }
    tserver::LogAndTombstone(meta, msg, uuid, status, ts_tablet_manager);
  }
  return status;
}
2564
2565
} // namespace tserver
2566
} // namespace yb