YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/tserver/ts_tablet_manager.cc
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#include "yb/tserver/ts_tablet_manager.h"
34
35
#include <algorithm>
36
#include <chrono>
37
#include <memory>
38
#include <mutex>
39
#include <string>
40
#include <thread>
41
#include <vector>
42
43
#include <boost/container/static_vector.hpp>
44
#include <boost/optional/optional.hpp>
45
#include <glog/logging.h>
46
47
#include "yb/client/client.h"
48
49
#include "yb/common/wire_protocol.h"
50
51
#include "yb/consensus/consensus.h"
52
#include "yb/consensus/multi_raft_batcher.h"
53
#include "yb/consensus/consensus_meta.h"
54
#include "yb/consensus/log.h"
55
#include "yb/consensus/log_anchor_registry.h"
56
#include "yb/consensus/metadata.pb.h"
57
#include "yb/consensus/opid_util.h"
58
#include "yb/consensus/quorum_util.h"
59
#include "yb/consensus/raft_consensus.h"
60
#include "yb/consensus/retryable_requests.h"
61
#include "yb/consensus/state_change_context.h"
62
63
#include "yb/docdb/docdb_rocksdb_util.h"
64
65
#include "yb/fs/fs_manager.h"
66
67
#include "yb/gutil/bind.h"
68
#include "yb/gutil/stl_util.h"
69
#include "yb/gutil/strings/substitute.h"
70
#include "yb/gutil/sysinfo.h"
71
72
#include "yb/master/master_heartbeat.pb.h"
73
#include "yb/master/sys_catalog.h"
74
75
#include "yb/rpc/messenger.h"
76
#include "yb/rpc/poller.h"
77
78
#include "yb/tablet/metadata.pb.h"
79
#include "yb/tablet/operations/split_operation.h"
80
#include "yb/tablet/tablet.h"
81
#include "yb/tablet/tablet.pb.h"
82
#include "yb/tablet/tablet_bootstrap_if.h"
83
#include "yb/tablet/tablet_metadata.h"
84
#include "yb/tablet/tablet_options.h"
85
#include "yb/tablet/tablet_peer.h"
86
87
#include "yb/tserver/heartbeater.h"
88
#include "yb/tserver/remote_bootstrap_client.h"
89
#include "yb/tserver/remote_bootstrap_session.h"
90
#include "yb/tserver/tablet_server.h"
91
#include "yb/tserver/tserver.pb.h"
92
93
#include "yb/util/debug/long_operation_tracker.h"
94
#include "yb/util/debug/trace_event.h"
95
#include "yb/util/env.h"
96
#include "yb/util/fault_injection.h"
97
#include "yb/util/flag_tags.h"
98
#include "yb/util/format.h"
99
#include "yb/util/logging.h"
100
#include "yb/util/mem_tracker.h"
101
#include "yb/util/metrics.h"
102
#include "yb/util/pb_util.h"
103
#include "yb/util/scope_exit.h"
104
#include "yb/util/shared_lock.h"
105
#include "yb/util/status_format.h"
106
#include "yb/util/status_log.h"
107
#include "yb/util/stopwatch.h"
108
#include "yb/util/trace.h"
109
110
using namespace std::literals;
111
using namespace std::placeholders;
112
113
DEFINE_int32(num_tablets_to_open_simultaneously, 0,
114
             "Number of threads available to open tablets during startup. If this "
115
             "is set to 0 (the default), then the number of bootstrap threads will "
116
             "be set based on the number of data directories. If the data directories "
117
             "are on some very fast storage device such as SSD or a RAID array, it "
118
             "may make sense to manually tune this.");
119
TAG_FLAG(num_tablets_to_open_simultaneously, advanced);
120
121
DEFINE_int32(tablet_start_warn_threshold_ms, 500,
122
             "If a tablet takes more than this number of millis to start, issue "
123
             "a warning with a trace.");
124
TAG_FLAG(tablet_start_warn_threshold_ms, hidden);
125
126
DEFINE_int32(cleanup_split_tablets_interval_sec, 60,
127
             "Interval at which tablet manager tries to cleanup split tablets which are no longer "
128
             "needed. Setting this to 0 disables cleanup of split tablets.");
129
130
DEFINE_test_flag(double, fault_crash_after_blocks_deleted, 0.0,
131
                 "Fraction of the time when the tablet will crash immediately "
132
                 "after deleting the data blocks during tablet deletion.");
133
134
DEFINE_test_flag(double, fault_crash_after_wal_deleted, 0.0,
135
                 "Fraction of the time when the tablet will crash immediately "
136
                 "after deleting the WAL segments during tablet deletion.");
137
138
DEFINE_test_flag(double, fault_crash_after_cmeta_deleted, 0.0,
139
                 "Fraction of the time when the tablet will crash immediately "
140
                 "after deleting the consensus metadata during tablet deletion.");
141
142
DEFINE_test_flag(double, fault_crash_after_rb_files_fetched, 0.0,
143
                 "Fraction of the time when the tablet will crash immediately "
144
                 "after fetching the files during a remote bootstrap but before "
145
                 "marking the superblock as TABLET_DATA_READY.");
146
147
DEFINE_test_flag(double, fault_crash_in_split_after_log_copied, 0.0,
148
                 "Fraction of the time when the tablet will crash immediately after initiating a "
149
                 "Log::CopyTo from parent to child tablet, but before marking the child tablet as "
150
                 "TABLET_DATA_READY.");
151
152
DEFINE_test_flag(bool, simulate_already_present_in_remote_bootstrap, false,
153
                 "If true, return an AlreadyPresent error in remote bootstrap after starting the "
154
                 "remote bootstrap client.");
155
156
DEFINE_test_flag(double, fault_crash_in_split_before_log_flushed, 0.0,
157
                 "Fraction of the time when the tablet will crash immediately before flushing a "
158
                 "parent tablet's kSplit operation.");
159
160
DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_greater_than, 0,
161
                 "If greater than zero, this process will crash if we detect more than the "
162
                 "specified number of remote bootstrap sessions.");
163
164
DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_per_table_greater_than, 0,
165
                 "If greater than zero, this process will crash if for any table we exceed the "
166
                 "specified number of remote bootstrap sessions");
167
168
DEFINE_test_flag(bool, crash_before_apply_tablet_split_op, false,
169
                 "Crash inside TSTabletManager::ApplyTabletSplit before doing anything");
170
171
DEFINE_test_flag(bool, force_single_tablet_failure, false,
172
                 "Force exactly one tablet to a failed state.");
173
174
DEFINE_test_flag(int32, apply_tablet_split_inject_delay_ms, 0,
175
                 "Inject delay into TSTabletManager::ApplyTabletSplit.");
176
177
DEFINE_test_flag(bool, skip_deleting_split_tablets, false,
178
                 "Skip deleting tablets which have been split.");
179
180
DEFINE_test_flag(bool, skip_post_split_compaction, false,
181
                 "Skip processing post split compaction.");
182
183
DEFINE_int32(verify_tablet_data_interval_sec, 0,
184
             "The tick interval time for the tablet data integrity verification background task. "
185
             "This defaults to 0, which means disable the background task.");
186
187
DEFINE_int32(cleanup_metrics_interval_sec, 60,
188
             "The tick interval time for the metrics cleanup background task. "
189
             "If set to 0, it disables the background task.");
190
191
DEFINE_bool(skip_tablet_data_verification, false,
192
            "Skip checking tablet data for corruption.");
193
194
DEFINE_int32(read_pool_max_threads, 128,
195
             "The maximum number of threads allowed for read_pool_. This pool is used "
196
             "to run multiple read operations, that are part of the same tablet rpc, "
197
             "in parallel.");
198
DEFINE_int32(read_pool_max_queue_size, 128,
199
             "The maximum number of tasks that can be held in the queue for read_pool_. This pool "
200
             "is used to run multiple read operations, that are part of the same tablet rpc, "
201
             "in parallel.");
202
203
DEFINE_int32(post_split_trigger_compaction_pool_max_threads, 1,
204
             "The maximum number of threads allowed for post_split_trigger_compaction_pool_. This "
205
             "pool is used to run compactions on tablets after they have been split and still "
206
             "contain irrelevant data from the tablet they were sourced from.");
207
DEFINE_int32(post_split_trigger_compaction_pool_max_queue_size, 16,
208
             "The maximum number of tasks that can be held in the pool for "
209
             "post_split_trigger_compaction_pool_. This pool is used to run compactions on tablets "
210
             "after they have been split and still contain irrelevant data from the tablet they "
211
             "were sourced from.");
212
213
DEFINE_test_flag(int32, sleep_after_tombstoning_tablet_secs, 0,
214
                 "Whether we sleep in LogAndTombstone after calling DeleteTabletData.");
215
216
constexpr int kTServerYbClientDefaultTimeoutMs = 60 * 1000;
217
218
DEFINE_int32(tserver_yb_client_default_timeout_ms, kTServerYbClientDefaultTimeoutMs,
219
             "Default timeout for the YBClient embedded into the tablet server that is used "
220
             "for distributed transactions.");
221
222
DEFINE_bool(enable_restart_transaction_status_tablets_first, true,
223
            "Set to true to prioritize bootstrapping transaction status tablets first.");
224
225
DECLARE_string(rocksdb_compact_flush_rate_limit_sharing_mode);
226
227
namespace yb {
228
namespace tserver {
229
230
METRIC_DEFINE_coarse_histogram(server, op_apply_queue_length, "Operation Apply Queue Length",
231
                        MetricUnit::kTasks,
232
                        "Number of operations waiting to be applied to the tablet. "
233
                        "High queue lengths indicate that the server is unable to process "
234
                        "operations as fast as they are being written to the WAL.");
235
236
METRIC_DEFINE_coarse_histogram(server, op_apply_queue_time, "Operation Apply Queue Time",
237
                        MetricUnit::kMicroseconds,
238
                        "Time that operations spent waiting in the apply queue before being "
239
                        "processed. High queue times indicate that the server is unable to "
240
                        "process operations as fast as they are being written to the WAL.");
241
242
METRIC_DEFINE_coarse_histogram(server, op_apply_run_time, "Operation Apply Run Time",
243
                        MetricUnit::kMicroseconds,
244
                        "Time that operations spent being applied to the tablet. "
245
                        "High values may indicate that the server is under-provisioned or "
246
                        "that operations consist of very large batches.");
247
248
// Metrics for the "read-parallel" thread pool (read_pool_), which runs the read operations that
// belong to a single tablet RPC in parallel. They are instantiated in the TSTabletManager
// constructor. The descriptions below describe the read pool; they previously duplicated the
// apply-pool wording (operations "applied to the tablet" / "written to the WAL").
METRIC_DEFINE_coarse_histogram(server, op_read_queue_length, "Operation Read op Queue Length",
                        MetricUnit::kTasks,
                        "Number of read operations waiting to be run by the read pool. "
                        "High queue lengths indicate that the server is unable to process "
                        "read operations as fast as they are being submitted.");

METRIC_DEFINE_coarse_histogram(server, op_read_queue_time, "Operation Read op Queue Time",
                        MetricUnit::kMicroseconds,
                        "Time that read operations spent waiting in the read queue before being "
                        "processed. High queue times indicate that the server is unable to "
                        "process read operations as fast as they are being submitted.");

METRIC_DEFINE_coarse_histogram(server, op_read_run_time, "Operation Read op Run Time",
                        MetricUnit::kMicroseconds,
                        "Time that read operations spent running in the read pool. "
                        "High values may indicate that the server is under-provisioned or "
                        "that read operations are very large.");
265
266
METRIC_DEFINE_coarse_histogram(server, ts_bootstrap_time, "TServer Bootstrap Time",
267
                        MetricUnit::kMicroseconds,
268
                        "Time that the tablet server takes to bootstrap all of its tablets.");
269
270
THREAD_POOL_METRICS_DEFINE(
271
    server, post_split_trigger_compaction_pool, "Thread pool for tablet compaction jobs.");
272
273
THREAD_POOL_METRICS_DEFINE(
274
    server, admin_triggered_compaction_pool, "Thread pool for tablet compaction jobs.");
275
276
using consensus::ConsensusMetadata;
277
using consensus::ConsensusStatePB;
278
using consensus::RaftConfigPB;
279
using consensus::RaftPeerPB;
280
using consensus::StartRemoteBootstrapRequestPB;
281
using log::Log;
282
using master::ReportedTabletPB;
283
using master::TabletReportPB;
284
using master::TabletReportUpdatesPB;
285
using std::shared_ptr;
286
using std::string;
287
using std::unordered_set;
288
using std::vector;
289
using strings::Substitute;
290
using tablet::BOOTSTRAPPING;
291
using tablet::NOT_STARTED;
292
using tablet::RaftGroupMetadata;
293
using tablet::RaftGroupMetadataPtr;
294
using tablet::RaftGroupStatePB;
295
using tablet::RUNNING;
296
using tablet::TABLET_DATA_COPYING;
297
using tablet::TABLET_DATA_DELETED;
298
using tablet::TABLET_DATA_INIT_STARTED;
299
using tablet::TABLET_DATA_READY;
300
using tablet::TABLET_DATA_SPLIT_COMPLETED;
301
using tablet::TABLET_DATA_TOMBSTONED;
302
using tablet::TabletDataState;
303
using tablet::TabletPeer;
304
using tablet::TabletPeerPtr;
305
using tablet::TabletStatusListener;
306
using tablet::TabletStatusPB;
307
308
constexpr int32_t kDefaultTserverBlockCacheSizePercentage = 50;
309
310
0
void TSTabletManager::VerifyTabletData() {
311
0
  LOG_WITH_PREFIX(INFO) << "Beginning tablet data verification checks";
312
0
  for (const TabletPeerPtr& peer : GetTabletPeers()) {
313
0
    if (peer->state() == RUNNING) {
314
0
      if (PREDICT_FALSE(FLAGS_skip_tablet_data_verification)) {
315
0
        LOG_WITH_PREFIX(INFO)
316
0
            << Format("Skipped tablet data verification check on $0", peer->tablet_id());
317
0
      } else {
318
0
        Status s = peer->tablet()->VerifyDataIntegrity();
319
0
        if (!s.ok()) {
320
0
          LOG(WARNING) << "Tablet data integrity verification failed on " << peer->tablet_id()
321
0
                       << ": " << s;
322
0
        }
323
0
      }
324
0
    }
325
0
  }
326
0
}
327
328
88.0k
void TSTabletManager::CleanupOldMetrics() {
329
88.0k
  VLOG
(2) << "Cleaning up old metrics"0
;
330
88.0k
  metric_registry_->RetireOldMetrics();
331
88.0k
}
332
333
// Constructs the manager in the MANAGER_INITIALIZING state and builds the thread pools that do
// not depend on the filesystem: apply, consensus, prepare, append, log-alloc, read-parallel,
// tablet-split-compaction and admin-compaction. Also creates the tablet memory manager.
//
// Any failure to build a pool aborts the process via CHECK_OK.
TSTabletManager::TSTabletManager(FsManager* fs_manager,
                                 TabletServer* server,
                                 MetricRegistry* metric_registry)
    : fs_manager_(fs_manager),
      server_(server),
      metric_registry_(metric_registry),
      state_(MANAGER_INITIALIZING) {
  // All server-level metrics below are attached to the same entity.
  const auto& server_entity = server_->metric_entity();

  ThreadPoolMetrics apply_metrics = {
      METRIC_op_apply_queue_length.Instantiate(server_entity),
      METRIC_op_apply_queue_time.Instantiate(server_entity),
      METRIC_op_apply_run_time.Instantiate(server_entity)
  };
  tablet_options_.ServerMetricEntity = server_entity;
  CHECK_OK(ThreadPoolBuilder("apply")
               .set_metrics(std::move(apply_metrics))
               .Build(&apply_pool_));

  // This pool is shared by all replicas hosted by this server.
  //
  // Some submitted tasks use blocking IO, so we configure no upper bound on
  // the maximum number of threads in each pool (otherwise the default value of
  // "number of CPUs" may cause blocking tasks to starve other "fast" tasks).
  // However, the effective upper bound is the number of replicas as each will
  // submit its own tasks via a dedicated token.
  CHECK_OK(ThreadPoolBuilder("consensus")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&raft_pool_));
  CHECK_OK(ThreadPoolBuilder("prepare")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&tablet_prepare_pool_));
  CHECK_OK(ThreadPoolBuilder("append")
               .set_min_threads(1)
               .unlimited_threads()
               .set_idle_timeout(MonoDelta::FromMilliseconds(10000))
               .Build(&append_pool_));
  CHECK_OK(ThreadPoolBuilder("log-alloc")
               .set_min_threads(1)
               .unlimited_threads()
               .Build(&allocation_pool_));

  // Pool that runs the read operations of a single tablet RPC in parallel; bounded by flags.
  ThreadPoolMetrics read_pool_metrics = {
      METRIC_op_read_queue_length.Instantiate(server_entity),
      METRIC_op_read_queue_time.Instantiate(server_entity),
      METRIC_op_read_run_time.Instantiate(server_entity)
  };
  CHECK_OK(ThreadPoolBuilder("read-parallel")
               .set_max_threads(FLAGS_read_pool_max_threads)
               .set_max_queue_size(FLAGS_read_pool_max_queue_size)
               .set_metrics(std::move(read_pool_metrics))
               .Build(&read_pool_));

  // Compaction pools: one for post-split compactions, one for admin-triggered compactions.
  CHECK_OK(ThreadPoolBuilder("tablet-split-compaction")
               .set_max_threads(FLAGS_post_split_trigger_compaction_pool_max_threads)
               .set_max_queue_size(FLAGS_post_split_trigger_compaction_pool_max_queue_size)
               .set_metrics(THREAD_POOL_METRICS_INSTANCE(
                   server_entity, post_split_trigger_compaction_pool))
               .Build(&post_split_trigger_compaction_pool_));
  CHECK_OK(ThreadPoolBuilder("admin-compaction")
               .set_max_threads(std::max(docdb::GetGlobalRocksDBPriorityThreadPoolSize(), 0))
               .set_metrics(THREAD_POOL_METRICS_INSTANCE(
                   server_entity, admin_triggered_compaction_pool))
               .Build(&admin_triggered_compaction_pool_));

  // The memory manager is handed a callback that returns the current set of tablet peers.
  mem_manager_ = std::make_shared<TabletMemoryManager>(
      &tablet_options_,
      server_->mem_tracker(),
      kDefaultTserverBlockCacheSizePercentage,
      server_entity,
      [this]() { return GetTabletPeers(); });
}
403
404
164
// Nothing to do explicitly; members are destroyed by their own destructors.
TSTabletManager::~TSTabletManager() = default;
406
407
8.74k
// Initializes the manager. Must be called while the manager is in MANAGER_INITIALIZING state.
//
// Steps, in order:
//   1. Prepare (but do not start) the asynchronous YBClient initializer.
//   2. Fill in tablet_options_ from the server.
//   3. Build the "tablet-bootstrap" pool used to open tablets.
//   4. Remove leftover checkpoint directories.
//   5. Load the metadata of every tablet found on disk and submit an OpenTablet task for each
//      tablet whose data can be served.
//   6. Switch to MANAGER_RUNNING, initialize the memory manager and create the pollers
//      (they are started later, in Start()).
//
// Returns the first error encountered; metadata load/open failures abort initialization.
Status TSTabletManager::Init() {
  CHECK_EQ(state(), MANAGER_INITIALIZING);

  // NOTE(review): the flag is in milliseconds and is divided by 1000 with integer division,
  // so values below 1000 become 0. Presumably the initializer expects seconds -- confirm.
  async_client_init_.emplace(
      "tserver_client", 0 /* num_reactors */,
      FLAGS_tserver_yb_client_default_timeout_ms / 1000, server_->permanent_uuid(),
      &server_->options(), server_->metric_entity(), server_->mem_tracker(),
      server_->messenger());

  // Once the client exists, point it at this tablet server, if the server and proxy are set.
  async_client_init_->AddPostCreateHook([this](client::YBClient* client) {
    auto* local_tserver = server();
    if (local_tserver == nullptr || local_tserver->proxy() == nullptr) {
      return;
    }
    client->SetLocalTabletServer(
        local_tserver->permanent_uuid(), local_tserver->proxy(), local_tserver);
  });

  tablet_options_.env = server_->GetEnv();
  tablet_options_.rocksdb_env = server_->GetRocksDBEnv();
  tablet_options_.listeners = server_->options().listeners;
  if (docdb::GetRocksDBRateLimiterSharingMode() == docdb::RateLimiterSharingMode::TSERVER) {
    tablet_options_.rate_limiter = docdb::CreateRocksDBRateLimiter();
  }

  // Start the threadpool we'll use to open tablets.
  // This has to be done in Init() instead of the constructor, since the
  // FsManager isn't initialized until this point.
  int max_bootstrap_threads = FLAGS_num_tablets_to_open_simultaneously;
  if (max_bootstrap_threads == 0) {
    // Flag left at its default: 2 threads on machines with at most 2 CPUs, otherwise the
    // smaller of (CPUs - 1) and (8 * number of data root directories).
    const int num_cpus = base::NumCPUs();
    max_bootstrap_threads = num_cpus <= 2
        ? 2
        : min(num_cpus - 1, narrow_cast<int>(fs_manager_->GetDataRootDirs().size()) * 8);
    LOG_WITH_PREFIX(INFO) <<  "max_bootstrap_threads=" << max_bootstrap_threads;
  }
  // Only the third slot (ts_bootstrap_time) is populated for this pool.
  ThreadPoolMetrics bootstrap_pool_metrics = {
      nullptr,
      nullptr,
      METRIC_ts_bootstrap_time.Instantiate(server_->metric_entity())
  };
  RETURN_NOT_OK(ThreadPoolBuilder("tablet-bootstrap")
                    .set_max_threads(max_bootstrap_threads)
                    .set_metrics(std::move(bootstrap_pool_metrics))
                    .Build(&open_tablet_pool_));

  CleanupCheckpoints();

  // Search for tablets in the metadata dir.
  vector<string> tablet_ids;
  RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablet_ids));

  InitLocalRaftPeerPB();

  multi_raft_manager_ = std::make_unique<consensus::MultiRaftManager>(
      server_->messenger(), &server_->proxy_cache(), local_peer_pb_.cloud_info());

  // Metadata of the tablets that will be opened, in the order they will be submitted.
  deque<RaftGroupMetadataPtr> metas;

  // First, load all of the tablet metadata. We do this before we start
  // submitting the actual OpenTablet() tasks so that we don't have to compete
  // for disk resources, etc, with bootstrap processes and running tablets.
  const MonoTime load_start = MonoTime::Now();
  for (const string& tablet_id : tablet_ids) {
    RaftGroupMetadataPtr meta;
    RETURN_NOT_OK_PREPEND(OpenTabletMeta(tablet_id, &meta),
                          "Failed to open tablet metadata for tablet: " + tablet_id);
    if (PREDICT_FALSE(!CanServeTabletData(meta->tablet_data_state()))) {
      // Tablets whose data cannot be served are handled separately and are not opened.
      RETURN_NOT_OK(HandleNonReadyTabletOnStartup(meta));
      continue;
    }
    RegisterDataAndWalDir(
        fs_manager_, meta->table_id(), meta->raft_group_id(), meta->data_root_dir(),
        meta->wal_root_dir());

    // Prioritize bootstrapping transaction status tablets first (when enabled by flag).
    const bool open_first = FLAGS_enable_restart_transaction_status_tablets_first &&
                            meta->table_type() == TRANSACTION_STATUS_TABLE_TYPE;
    if (open_first) {
      metas.push_front(meta);
    } else {
      metas.push_back(meta);
    }
  }

  const MonoDelta elapsed = MonoTime::Now().GetDeltaSince(load_start);
  LOG(INFO) << "Loaded metadata for " << tablet_ids.size() << " tablet in "
            << elapsed.ToMilliseconds() << " ms";

  // Now submit the "Open" task for each.
  for (const RaftGroupMetadataPtr& meta : metas) {
    scoped_refptr<TransitionInProgressDeleter> deleter;
    RETURN_NOT_OK(StartTabletStateTransition(
        meta->raft_group_id(), "opening tablet", &deleter));

    // The peer is registered here; the returned pointer itself is not used further.
    TabletPeerPtr registered_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER));
    RETURN_NOT_OK(open_tablet_pool_->SubmitFunc(
        std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)));
  }

  {
    std::lock_guard<RWMutex> lock(mutex_);
    state_ = MANAGER_RUNNING;
  }

  RETURN_NOT_OK(mem_manager_->Init());

  // Background pollers are only created here; Start() decides which of them actually run.
  tablets_cleaner_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::CleanupSplitTablets, this));

  verify_tablet_data_poller_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::VerifyTabletData, this));

  metrics_cleaner_ = std::make_unique<rpc::Poller>(
      LogPrefix(), std::bind(&TSTabletManager::CleanupOldMetrics, this));

  return Status::OK();
}
528
529
8.74k
void TSTabletManager::CleanupCheckpoints() {
530
8.84k
  for (const auto& data_root : fs_manager_->GetDataRootDirs()) {
531
8.84k
    auto tables_dir = JoinPathSegments(data_root, FsManager::kRocksDBDirName);
532
8.84k
    auto tables = fs_manager_->env()->GetChildren(tables_dir, ExcludeDots::kTrue);
533
8.84k
    if (!tables.ok()) {
534
0
      LOG_WITH_PREFIX(WARNING)
535
0
          << "Failed to get tables in " << tables_dir << ": " << tables.status();
536
0
      continue;
537
0
    }
538
8.84k
    for (const auto& table : *tables) {
539
94
      auto table_dir = JoinPathSegments(tables_dir, table);
540
94
      auto tablets = fs_manager_->env()->GetChildren(table_dir, ExcludeDots::kTrue);
541
94
      if (!tablets.ok()) {
542
0
        LOG_WITH_PREFIX(WARNING)
543
0
            << "Failed to get tablets in " << table_dir << ": " << tablets.status();
544
0
        continue;
545
0
      }
546
566
      
for (const auto& tablet : *tablets)94
{
547
566
        auto checkpoints_dir = JoinPathSegments(
548
566
            table_dir, tablet, RemoteBootstrapSession::kCheckpointsDir);
549
566
        if (fs_manager_->env()->FileExists(checkpoints_dir)) {
550
0
          LOG_WITH_PREFIX(INFO) << "Cleaning up checkpoints dir: " << yb::ToString(checkpoints_dir);
551
0
          auto status = fs_manager_->env()->DeleteRecursively(checkpoints_dir);
552
0
          WARN_NOT_OK(status, Format("Cleanup of checkpoints dir $0 failed", checkpoints_dir));
553
0
        }
554
566
      }
555
94
    }
556
8.84k
  }
557
8.74k
}
558
559
8.58k
// Starts the asynchronous client initialization and the background pollers created in Init().
//
// Each poller is governed by its own *_interval_sec flag: a positive value starts the poller
// with that period (in seconds), any other value leaves it stopped. Always returns OK.
Status TSTabletManager::Start() {
  async_client_init_->Start();

  // Cleanup of tablets that have been split.
  if (FLAGS_cleanup_split_tablets_interval_sec <= 0) {
    LOG(INFO)
        << "Split tablets cleanup is disabled by cleanup_split_tablets_interval_sec flag set to 0";
  } else {
    tablets_cleaner_->Start(
        &server_->messenger()->scheduler(), FLAGS_cleanup_split_tablets_interval_sec * 1s);
    LOG(INFO) << "Split tablets cleanup monitor started...";
  }

  // Periodic tablet data integrity verification.
  if (FLAGS_verify_tablet_data_interval_sec <= 0) {
    LOG(INFO)
        << "Tablet data verification is disabled by verify_tablet_data_interval_sec flag set to 0";
  } else {
    verify_tablet_data_poller_->Start(
        &server_->messenger()->scheduler(), FLAGS_verify_tablet_data_interval_sec * 1s);
    LOG(INFO) << "Tablet data verification task started...";
  }

  // Retirement of old metrics.
  if (FLAGS_cleanup_metrics_interval_sec <= 0) {
    LOG(INFO)
        << "Old metrics cleanup is disabled by cleanup_metrics_interval_sec flag set to 0";
  } else {
    metrics_cleaner_->Start(
        &server_->messenger()->scheduler(), FLAGS_cleanup_metrics_interval_sec * 1s);
    LOG(INFO) << "Old metrics cleanup task started...";
  }

  return Status::OK();
}
588
589
354k
void TSTabletManager::CleanupSplitTablets() {
590
354k
  
VLOG_WITH_PREFIX_AND_FUNC0
(3) << "looking for tablets to cleanup..."0
;
591
354k
  auto tablet_peers = GetTabletPeers();
592
1.63M
  for (const auto& tablet_peer : tablet_peers) {
593
1.63M
    if (tablet_peer->CanBeDeleted()) {
594
13
      const auto& tablet_id = tablet_peer->tablet_id();
595
13
      if (PREDICT_FALSE(FLAGS_TEST_skip_deleting_split_tablets)) {
596
7
        LOG_WITH_PREFIX(INFO) << Format("Skipped triggering delete of tablet $0", tablet_id);
597
7
      } else {
598
6
        LOG_WITH_PREFIX(INFO) << Format("Triggering delete of tablet $0", tablet_id);
599
6
        client().DeleteNotServingTablet(
600
6
            tablet_peer->tablet_id(), [tablet_id](const Status& status) {
601
6
              LOG(INFO) << Format("Tablet $0 deletion result: $1", tablet_id, status);
602
6
            });
603
6
      }
604
13
    }
605
1.63M
  }
606
354k
}
607
608
14
// Blocks until every task submitted to open_tablet_pool_ has finished, then reports whether any
// registered tablet ended up FAILED.
//
// Requires the manager to be in MANAGER_RUNNING state (CHECK-enforced).
// Returns OK when no tablet failed; otherwise the error of the first FAILED tablet encountered
// while iterating tablet_map_.
Status TSTabletManager::WaitForAllBootstrapsToFinish() {
  CHECK_EQ(state(), MANAGER_RUNNING);

  open_tablet_pool_->Wait();

  Status first_failure = Status::OK();

  SharedLock<RWMutex> shared_lock(mutex_);
  for (const TabletMap::value_type& entry : tablet_map_) {
    const auto& peer = entry.second;
    if (peer->state() != tablet::FAILED) {
      continue;
    }
    // Only the first failure is kept; later ones are ignored.
    if (first_failure.ok()) {
      first_failure = peer->error();
    }
  }

  return first_failure;
}
626
627
// Marks `tablet_id` as being in the "creating tablet" transition, after verifying under the
// manager lock (held in shared mode) that no tablet with this id is registered yet.
//
// Returns the transition deleter on success, AlreadyPresent if the tablet is already
// registered, or the error from StartTabletStateTransition().
Result<scoped_refptr<TransitionInProgressDeleter>>
TSTabletManager::StartTabletStateTransitionForCreation(const TabletId& tablet_id) {
  // Declared before the lock so that it is destroyed after the lock is released.
  scoped_refptr<TransitionInProgressDeleter> transition_deleter;
  SharedLock<RWMutex> lock(mutex_);
  TRACE("Acquired tablet manager lock");

  // Sanity check that the tablet isn't already registered.
  TabletPeerPtr existing_peer;
  const bool already_registered = LookupTabletUnlocked(tablet_id, &existing_peer);
  if (already_registered) {
    return STATUS(AlreadyPresent, "Tablet already registered", tablet_id);
  }

  RETURN_NOT_OK(StartTabletStateTransition(tablet_id, "creating tablet", &transition_deleter));

  return transition_deleter;
}
643
644
// Creates a brand-new tablet replica on this server and schedules it to be opened.
//
// table_info          - table the tablet belongs to.
// tablet_id           - id of the new tablet; must not be registered yet.
// partition           - partition covered by the tablet.
// config              - initial Raft config; must contain this server, and every peer must
//                       have a member type (both CHECK-enforced). Its opid_index is reset to
//                       consensus::kInvalidOpIdIndex.
// colocated           - passed through to the tablet metadata.
// snapshot_schedules  - passed through to the tablet metadata.
//
// Returns the registered tablet peer, or an error: IllegalState when the manager is not
// running, the error from starting the creation transition, or a failure to create the tablet
// metadata, the consensus metadata, the peer, or to submit the open task.
Result<TabletPeerPtr> TSTabletManager::CreateNewTablet(
    const tablet::TableInfoPtr& table_info,
    const string& tablet_id,
    const Partition& partition,
    RaftConfigPB config,
    const bool colocated,
    const std::vector<SnapshotScheduleId>& snapshot_schedules) {
  if (state() != MANAGER_RUNNING) {
    return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state());
  }
  CHECK(IsRaftConfigMember(server_->instance_pb().permanent_uuid(), config));

  for (const auto& config_peer : config.peers()) {
    CHECK(config_peer.has_member_type());
  }

  // Set the initial opid_index for a RaftConfigPB to -1.
  config.set_opid_index(consensus::kInvalidOpIdIndex);

  scoped_refptr<TransitionInProgressDeleter> deleter =
      VERIFY_RESULT(StartTabletStateTransitionForCreation(tablet_id));

  // Create the metadata.
  TRACE("Creating new metadata...");
  string data_root_dir;
  string wal_root_dir;
  GetAndRegisterDataAndWalDir(
      fs_manager_, table_info->table_id, tablet_id, &data_root_dir, &wal_root_dir);
  auto metadata_result = RaftGroupMetadata::CreateNew(tablet::RaftGroupMetadataData {
    .fs_manager = fs_manager_,
    .table_info = table_info,
    .raft_group_id = tablet_id,
    .partition = partition,
    .tablet_data_state = TABLET_DATA_READY,
    .colocated = colocated,
    .snapshot_schedules = snapshot_schedules,
  }, data_root_dir, wal_root_dir);
  if (!metadata_result.ok()) {
    // Undo the directory registration done above before reporting the failure.
    UnregisterDataWalDir(table_info->table_id, tablet_id, data_root_dir, wal_root_dir);
  }
  RETURN_NOT_OK_PREPEND(metadata_result, "Couldn't create tablet metadata");
  RaftGroupMetadataPtr meta = std::move(*metadata_result);
  LOG(INFO) << TabletLogPrefix(tablet_id)
            << "Created tablet metadata for table: " << table_info->table_id;

  // We must persist the consensus metadata to disk before starting a new
  // tablet's TabletPeer and Consensus implementation.
  std::unique_ptr<ConsensusMetadata> cmeta;
  RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager_, tablet_id, fs_manager_->uuid(),
                                                  config, consensus::kMinimumTerm, &cmeta),
                        "Unable to create new ConsensusMeta for tablet " + tablet_id);
  TabletPeerPtr created_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER));

  // Opening the tablet is handed to open_tablet_pool_; the task keeps the transition deleter.
  RETURN_NOT_OK(
      open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)));

  return created_peer;
}
704
705
// Per-subtablet bookkeeping used while applying a tablet split. One instance is built for each
// new tablet by PrepareTabletCreationMetaDataForSplit and then filled in further by
// StartSubtabletsSplit and ApplyTabletSplit.
struct TabletCreationMetaData {
706
  // ID of the new tablet (taken from the split request's new_tablet1_id / new_tablet2_id).
  TabletId tablet_id;
707
  // Transition guard obtained from StartTabletStateTransitionForCreation in
  // StartSubtabletsSplit; later passed on to CreatePeerAndOpenTablet.
  scoped_refptr<TransitionInProgressDeleter> transition_deleter;
708
  // Copy of the source tablet partition with its start or end replaced by the split partition key.
  Partition partition;
709
  // Copy of the source tablet key bounds with its lower or upper bound reset to the split
  // encoded key.
  docdb::KeyBounds key_bounds;
710
  // Metadata of the new tablet; assigned in ApplyTabletSplit from Tablet::CreateSubtablet.
  RaftGroupMetadataPtr raft_group_metadata;
711
};
712
713
namespace {
714
715
// Creates SplitTabletsCreationMetaData for two new tablets for `tablet` splitting based on request.
716
SplitTabletsCreationMetaData PrepareTabletCreationMetaDataForSplit(
717
68
    const tablet::SplitTabletRequestPB& request, const tablet::Tablet& tablet) {
718
68
  SplitTabletsCreationMetaData metas;
719
720
68
  const auto& split_partition_key = request.split_partition_key();
721
68
  const auto& split_encoded_key = request.split_encoded_key();
722
723
68
  std::shared_ptr<Partition> source_partition = tablet.metadata()->partition();
724
68
  const auto source_key_bounds = *tablet.doc_db().key_bounds;
725
726
68
  {
727
68
    TabletCreationMetaData meta;
728
68
    meta.tablet_id = request.new_tablet1_id();
729
68
    meta.partition = *source_partition;
730
68
    meta.key_bounds = source_key_bounds;
731
68
    meta.partition.set_partition_key_end(split_partition_key);
732
68
    meta.key_bounds.upper.Reset(split_encoded_key);
733
68
    metas.push_back(meta);
734
68
  }
735
736
68
  {
737
68
    TabletCreationMetaData meta;
738
68
    meta.tablet_id = request.new_tablet2_id();
739
68
    meta.partition = *source_partition;
740
68
    meta.key_bounds = source_key_bounds;
741
68
    meta.partition.set_partition_key_start(split_partition_key);
742
68
    meta.key_bounds.lower.Reset(split_encoded_key);
743
68
    metas.push_back(meta);
744
68
  }
745
746
68
  return metas;
747
68
}
748
749
}  // namespace
750
751
// Prepares every entry of `*tcmetas` for creation as a subtablet of the tablet described by
// `source_tablet_meta`:
//  - registers a state transition for the subtablet and stores its deleter in the entry;
//  - removes the entry from `*tcmetas` when the subtablet is already being handled elsewhere or
//    already exists on disk in a state that can serve data;
//  - otherwise removes leftover data dir, WAL and consensus metadata of the subtablet.
// Returns the first error encountered, or OK.
Status TSTabletManager::StartSubtabletsSplit(
    const RaftGroupMetadata& source_tablet_meta, SplitTabletsCreationMetaData* tcmetas) {
  auto* const env = fs_manager_->env();

  // Entries may be erased while iterating, so the iterator is advanced explicitly at the bottom
  // of the loop body only.
  for (auto it = tcmetas->begin(); it != tcmetas->end();) {
    const auto& subtablet_id = it->tablet_id;

    auto deleter_result = StartTabletStateTransitionForCreation(subtablet_id);
    if (!deleter_result.ok()) {
      if (!deleter_result.status().IsAlreadyPresent()) {
        return deleter_result.status();
      }
      // State transition for sub tablet with subtablet_id could be already registered because its
      // remote bootstrap (from already split parent tablet leader) is in progress.
      it = tcmetas->erase(it);
      continue;
    }
    it->transition_deleter = *deleter_result;

    // Check whether metadata from a previous, not completed split is present.
    auto existing_meta = RaftGroupMetadata::Load(fs_manager_, subtablet_id);
    if (existing_meta.ok() && CanServeTabletData((*existing_meta)->tablet_data_state())) {
      // Sub tablet has been already created and ready during previous split attempt at this node or
      // as a result of remote bootstrap from another node, no need to re-create.
      it = tcmetas->erase(it);
      continue;
    }

    // Wipe on-disk data for the new tablet ID in case it is present as a leftover from a
    // previously failed tablet split attempt.
    // TODO(tsplit): add test for that.
    const auto subtablet_data_dir = source_tablet_meta.GetSubRaftGroupDataDir(subtablet_id);
    if (env->FileExists(subtablet_data_dir)) {
      RETURN_NOT_OK_PREPEND(
          env->DeleteRecursively(subtablet_data_dir),
          Format("Unable to recursively delete data dir for tablet $0", subtablet_id));
    }
    RETURN_NOT_OK(Log::DeleteOnDiskData(
        env, subtablet_id, source_tablet_meta.GetSubRaftGroupWalDir(subtablet_id),
        fs_manager_->uuid()));
    RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(fs_manager_, subtablet_id));

    ++it;
  }
  return Status::OK();
}
798
799
void TSTabletManager::CreatePeerAndOpenTablet(
800
    const tablet::RaftGroupMetadataPtr& meta,
801
134
    const scoped_refptr<TransitionInProgressDeleter>& deleter) {
802
134
  Status s = ResultToStatus(CreateAndRegisterTabletPeer(meta, NEW_PEER));
803
134
  if (!s.ok()) {
804
0
    s = s.CloneAndPrepend("Failed to create and register tablet peer");
805
0
    if (s.IsShutdownInProgress()) {
806
      // If shutdown is in progress, it is not a failure to not being able to create and register
807
      // tablet peer.
808
0
      LOG_WITH_PREFIX(WARNING) << s;
809
0
    } else {
810
0
      LOG_WITH_PREFIX(DFATAL) << s;
811
0
    }
812
0
    return;
813
0
  }
814
134
  s = open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter));
815
134
  if (!s.ok()) {
816
0
    LOG(DFATAL) << Format("Failed to schedule opening tablet $0: $1", meta->table_id(), s);
817
0
    return;
818
0
  }
819
134
}
820
821
70
// Applies a SPLIT_OP to the tablet referenced by `operation`:
//  1. validates manager state and the tablet ids in the request;
//  2. flushes the Raft log index (`raft_log`, or the log of the local tablet peer when nullptr);
//  3. prepares the subtablets (StartSubtabletsSplit) and registers their data/WAL dirs;
//  4. for each subtablet creates the raft group metadata, flushes consensus metadata, copies the
//     log and marks the subtablet TABLET_DATA_READY;
//  5. marks the split as done in the source tablet metadata and schedules asynchronous
//     peer creation/opening for each subtablet.
// If the function exits before reaching the end, the data/WAL dir registrations from step 3 are
// undone.
Status TSTabletManager::ApplyTabletSplit(tablet::SplitOperation* operation, log::Log* raft_log) {
  if (PREDICT_FALSE(FLAGS_TEST_crash_before_apply_tablet_split_op)) {
    LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_apply_tablet_split_op";
  }

  if (state() != MANAGER_RUNNING) {
    return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state());
  }

  auto* tablet = CHECK_NOTNULL(operation->tablet());
  const auto source_tablet_id = tablet->tablet_id();
  const auto* request = operation->request();

  // The operation must target exactly this tablet ...
  SCHECK_EQ(
      request->tablet_id(), source_tablet_id, IllegalState,
      Format(
          "Unexpected SPLIT_OP $0 designated for tablet $1 to be applied to tablet $2",
          operation->op_id(), request->tablet_id(), source_tablet_id));
  // ... and neither of the new tablets may reuse the source tablet id.
  SCHECK(
      source_tablet_id != request->new_tablet1_id() &&
          source_tablet_id != request->new_tablet2_id(),
      IllegalState,
      Format(
          "One of SPLIT_OP $0 destination tablet IDs ($1, $2) is the same as source tablet ID $3",
          operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id(),
          source_tablet_id));

  LOG_WITH_PREFIX(INFO) << "Tablet " << source_tablet_id << " split operation apply started";

  if (!raft_log) {
    // No log supplied by the caller: use the one owned by the local peer of the source tablet.
    auto source_peer = VERIFY_RESULT(LookupTablet(source_tablet_id));
    raft_log = source_peer->raft_consensus()->log().get();
  }

  MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_before_log_flushed);

  RETURN_NOT_OK(raft_log->FlushIndex());

  auto& source_meta = *CHECK_NOTNULL(tablet->metadata());

  // Subtablets are placed into the same root dirs as the source tablet.
  // TODO(tsplit): We can later implement better per-disk distribution during compaction of split
  // tablets.
  const auto table_id = source_meta.table_id();
  const auto data_root_dir = VERIFY_RESULT(
      GetAssignedRootDirForTablet(TabletDirType::kData, table_id, source_tablet_id));
  const auto wal_root_dir = VERIFY_RESULT(
      GetAssignedRootDirForTablet(TabletDirType::kWal, table_id, source_tablet_id));

  if (FLAGS_TEST_apply_tablet_split_inject_delay_ms > 0) {
    LOG(INFO) << "TEST: ApplyTabletSplit: injecting delay of "
              << FLAGS_TEST_apply_tablet_split_inject_delay_ms << " ms for "
              << AsString(*operation);
    std::this_thread::sleep_for(FLAGS_TEST_apply_tablet_split_inject_delay_ms * 1ms);
    LOG(INFO) << "TEST: ApplyTabletSplit: delay finished";
  }

  auto subtablet_metas = PrepareTabletCreationMetaDataForSplit(*request, *tablet);

  RETURN_NOT_OK(StartSubtabletsSplit(source_meta, &subtablet_metas));

  for (const auto& subtablet : subtablet_metas) {
    RegisterDataAndWalDir(fs_manager_, table_id, subtablet.tablet_id, data_root_dir, wal_root_dir);
  }

  // Undo the registrations above unless we make it all the way to the end.
  bool successfully_completed = false;
  auto unregister_dirs_on_failure = ScopeExit([&] {
    if (successfully_completed) {
      return;
    }
    for (const auto& subtablet : subtablet_metas) {
      UnregisterDataWalDir(table_id, subtablet.tablet_id, data_root_dir, wal_root_dir);
    }
  });

  std::unique_ptr<ConsensusMetadata> consensus_meta;
  RETURN_NOT_OK(ConsensusMetadata::Load(
      fs_manager_, source_tablet_id, fs_manager_->uuid(), &consensus_meta));

  for (auto& subtablet : subtablet_metas) {
    const auto& new_tablet_id = subtablet.tablet_id;

    // Raft group metadata of the subtablet.
    subtablet.raft_group_metadata = VERIFY_RESULT(tablet->CreateSubtablet(
        new_tablet_id, subtablet.partition, subtablet.key_bounds, operation->op_id(),
        operation->hybrid_time()));
    LOG_WITH_PREFIX(INFO) << "Created raft group metadata for table: " << table_id
                          << " tablet: " << new_tablet_id;

    // Consensus metadata of the subtablet.
    // The same ConsensusMetadata instance is reused for both new tablets. This is safe, because:
    // 1) Their consensus metadata only differ by tablet id.
    // 2) Flush() will save it into a new path corresponding to tablet id we set before flushing.
    consensus_meta->set_tablet_id(new_tablet_id);
    consensus_meta->set_split_parent_tablet_id(source_tablet_id);
    RETURN_NOT_OK(consensus_meta->Flush());

    const auto& subtablet_wal_dir = subtablet.raft_group_metadata->wal_dir();
    RETURN_NOT_OK(raft_log->CopyTo(subtablet_wal_dir));

    MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_after_log_copied);

    subtablet.raft_group_metadata->set_tablet_data_state(TABLET_DATA_READY);
    RETURN_NOT_OK(subtablet.raft_group_metadata->Flush());
  }

  source_meta.SetSplitDone(
      operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id());
  RETURN_NOT_OK(source_meta.Flush());

  tablet->SplitDone();

  for (auto& subtablet : subtablet_metas) {
    // CreatePeerAndOpenTablet is invoked asynchronously to avoid write-locking
    // TSTabletManager::mutex_ here since apply of SPLIT_OP is done under ReplicaState lock and
    // this could lead to deadlock in case of reverse lock order in some other thread.
    // See https://github.com/yugabyte/yugabyte-db/issues/4312 for more details.
    RETURN_NOT_OK(apply_pool_->SubmitFunc(std::bind(
        &TSTabletManager::CreatePeerAndOpenTablet, this, subtablet.raft_group_metadata,
        subtablet.transition_deleter)));
  }

  successfully_completed = true;
  LOG_WITH_PREFIX(INFO) << "Tablet " << source_tablet_id << " split operation has been applied";
  return Status::OK();
}
940
941
368k
// Returns the log line prefix "T <tablet_id> P <uuid>: " for the given tablet id and peer uuid.
string LogPrefix(const string& tablet_id, const string& uuid) {
  string prefix("T ");
  prefix += tablet_id;
  prefix += " P ";
  prefix += uuid;
  prefix += ": ";
  return prefix;
}
944
945
// Verifies that `leader_term` is not lower than `last_logged_term`.
// Returns OK when leader_term >= last_logged_term; otherwise logs a warning prefixed with
// LogPrefix(tablet_id, uuid) and returns an InvalidArgument status describing the rejection.
Status CheckLeaderTermNotLower(
    const string& tablet_id,
    const string& uuid,
    int64_t leader_term,
    int64_t last_logged_term) {
  if (!PREDICT_FALSE(leader_term < last_logged_term)) {
    return Status::OK();
  }

  Status rejection = STATUS(InvalidArgument,
      Substitute("Leader has replica of tablet $0 with term $1 lower than last "
                 "logged term $2 on local replica. Rejecting remote bootstrap request",
                 tablet_id, leader_term, last_logged_term));
  LOG(WARNING) << LogPrefix(tablet_id, uuid) << "Remote bootstrap: " << rejection;
  return rejection;
}
960
961
// Decides whether the existing local replica described by `meta` / `old_tablet_peer` may be
// replaced via remote bootstrap, based on the tablet data state stored in `meta`:
//  - TABLET_DATA_COPYING: fatal error;
//  - TABLET_DATA_TOMBSTONED: OK provided old_tablet_peer->CheckShutdownOrNotStarted() succeeds
//    and `leader_term` is not lower than the term of the tombstone's last logged op id;
//  - TABLET_DATA_SPLIT_COMPLETED / TABLET_DATA_READY: IllegalState (fatal for the sys catalog
//    tablet);
//  - anything else: IllegalState.
Status HandleReplacingStaleTablet(
    RaftGroupMetadataPtr meta,
    TabletPeerPtr old_tablet_peer,
    const string& tablet_id,
    const string& uuid,
    const int64_t& leader_term) {
  const TabletDataState data_state = meta->tablet_data_state();
  switch (data_state) {
    case TABLET_DATA_COPYING: {
      // This should not be possible due to the transition_in_progress_ "lock".
      LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: "
                 << "Found tablet in TABLET_DATA_COPYING state during StartRemoteBootstrap()";
    }
    case TABLET_DATA_TOMBSTONED: {
      RETURN_NOT_OK(old_tablet_peer->CheckShutdownOrNotStarted());
      const int64_t last_logged_term = meta->tombstone_last_logged_opid().term;
      return CheckLeaderTermNotLower(tablet_id, uuid, leader_term, last_logged_term);
    }
    case TABLET_DATA_SPLIT_COMPLETED:
    case TABLET_DATA_READY: {
      if (tablet_id == master::kSysCatalogTabletId) {
        LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: "
                   << "Found tablet in " << TabletDataState_Name(data_state)
                   << " state during StartRemoteBootstrap()";
      }
      // A valid race can bring us here:
      // 1. Leader sends a second remote bootstrap request as a result of receiving a
      // TABLET_NOT_FOUND from this tserver while it was in the middle of a remote bootstrap.
      // 2. The remote bootstrap request arrives after the first one is finished, and it is able to
      // grab the mutex.
      // 3. This tserver finds that it already has the metadata for the tablet, and determines that
      // it needs to replace the tablet setting replacing_tablet to true.
      // In this case, the master can simply ignore this error.
      return STATUS_FORMAT(
          IllegalState, "Tablet $0 in $1 state", tablet_id, TabletDataState_Name(data_state));
    }
    default: {
      return STATUS(IllegalState,
          Substitute("Found tablet $0 in unexpected state $1 for remote bootstrap.",
                     tablet_id, TabletDataState_Name(data_state)));
    }
  }
}
1010
1011
10.3k
// Remote bootstraps the tablet named in `req` from the peer described by the request:
//  1. registers the bootstrap (counter + source address) and a state transition for the tablet;
//  2. if a local replica already exists, checks that it may be replaced;
//  3. starts a RemoteBootstrapClient session, registers a tablet peer, downloads all files and
//     finishes the session (tombstoning the tablet if any of these later steps fail);
//  4. opens the tablet and verifies the role change, then removes the bootstrap session.
// Returns OK on success or the first error encountered.
Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
  // To prevent racing against Shutdown, the counter is incremented as soon as we start. This
  // must happen before checking for ClosingUnlocked, as on shutdown, we proceed in reverse:
  // - first mark as closing
  // - then wait for num_tablets_being_remote_bootstrapped_ == 0
  ++num_tablets_being_remote_bootstrapped_;
  auto private_addr = req.source_private_addr()[0].host();
  auto cleanup_on_exit = ScopeExit([this, &private_addr]() {
    {
      std::lock_guard<RWMutex> lock(mutex_);
      auto addr_it = bootstrap_source_addresses_.find(private_addr);
      if (addr_it != bootstrap_source_addresses_.end()) {
        bootstrap_source_addresses_.erase(addr_it);
      }
    }
    --num_tablets_being_remote_bootstrapped_;
  });

  LongOperationTracker tracker("StartRemoteBootstrap", 5s);

  const string& tablet_id = req.tablet_id();
  const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid();
  HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort(
      req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(),
      server_->MakeCloudInfoPB()));
  int64_t leader_term = req.caller_term();

  const string kLogPrefix = TabletLogPrefix(tablet_id);

  TabletPeerPtr old_tablet_peer;
  RaftGroupMetadataPtr meta;
  bool replacing_tablet = false;
  scoped_refptr<TransitionInProgressDeleter> deleter;
  {
    std::lock_guard<RWMutex> lock(mutex_);
    bootstrap_source_addresses_.emplace(private_addr);
    if (ClosingUnlocked()) {
      auto wrong_state = STATUS_FORMAT(
          IllegalState, "StartRemoteBootstrap in wrong state: $0",
          TSTabletManagerStatePB_Name(state_));
      LOG(WARNING) << kLogPrefix << wrong_state;
      return wrong_state;
    }

    // An already registered peer for this tablet means we are replacing a local replica.
    if (LookupTabletUnlocked(tablet_id, &old_tablet_peer)) {
      meta = old_tablet_peer->tablet_metadata();
      replacing_tablet = true;
    }
    RETURN_NOT_OK(StartTabletStateTransition(
        tablet_id, Substitute("remote bootstrapping tablet from peer $0", bootstrap_peer_uuid),
        &deleter));
  }

  if (replacing_tablet) {
    // Make sure the existing tablet peer is shut down and tombstoned.
    RETURN_NOT_OK(HandleReplacingStaleTablet(
        meta, old_tablet_peer, tablet_id, fs_manager_->uuid(), leader_term));
  }

  string start_message =
      kLogPrefix + Substitute("Initiating remote bootstrap from Peer $0 ($1)",
                              bootstrap_peer_uuid, bootstrap_peer_addr.ToString());
  LOG(INFO) << start_message;
  TRACE(start_message);

  auto rb_client = std::make_unique<RemoteBootstrapClient>(tablet_id, fs_manager_);

  // Download and persist the remote superblock in TABLET_DATA_COPYING state.
  if (replacing_tablet) {
    RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term));
  }
  RETURN_NOT_OK(rb_client->Start(
      bootstrap_peer_uuid, &server_->proxy_cache(), bootstrap_peer_addr, &meta, this));

  // From this point onward, the superblock is persisted in TABLET_DATA_COPYING
  // state, and we need to tombstone the tablet if additional steps prior to
  // getting to a TABLET_DATA_READY state fail.

  if (PREDICT_FALSE(FLAGS_TEST_simulate_already_present_in_remote_bootstrap)) {
    LOG_WITH_PREFIX(INFO)
        << "Simulating AlreadyPresent error in TSTabletManager::StartRemoteBootstrap.";
    return STATUS(AlreadyPresent, "failed");
  }

  // Registering a non-initialized TabletPeer offers visibility through the Web UI.
  TabletPeerPtr tablet_peer = VERIFY_RESULT(
      CreateAndRegisterTabletPeer(meta, replacing_tablet ? REPLACEMENT_PEER : NEW_PEER));
  MarkTabletBeingRemoteBootstrapped(
      tablet_peer->tablet_id(), tablet_peer->tablet_metadata()->table_id());

  // TODO: If we ever make this method asynchronous, we need to move this code somewhere else.
  auto unmark_on_exit = ScopeExit([this, tablet_peer] {
    UnmarkTabletBeingRemoteBootstrapped(
        tablet_peer->tablet_id(), tablet_peer->tablet_metadata()->table_id());
  });

  // Download all of the remote files.
  TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer->status_listener()),
                   meta,
                   fs_manager_->uuid(),
                   "Remote bootstrap: Unable to fetch data from remote peer " +
                       bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")",
                   this);

  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_rb_files_fetched);

  // Write out the last files to make the new replica visible and update the
  // TabletDataState in the superblock to TABLET_DATA_READY.
  // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a
  // ChangeConfig request (to change this server's role from PRE_VOTER or PRE_OBSERVER to VOTER or
  // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could
  // have successfully submitted the ChangeConfig request and failed to respond in time)
  // and check the committed config until we find that this server's role has changed, or until we
  // time out which will cause us to tombstone the tablet.
  TOMBSTONE_NOT_OK(rb_client->Finish(),
                   meta,
                   fs_manager_->uuid(),
                   "Remote bootstrap: Failed calling Finish()",
                   this);

  LOG(INFO) << kLogPrefix << "Remote bootstrap: Opening tablet";

  // TODO(hector):  ENG-3173: We need to simulate a failure in OpenTablet during remote bootstrap
  // and verify that this tablet server gets remote bootstrapped again by the leader. We also need
  // to check what happens when this server receives raft consensus requests since at this point,
  // this tablet server could be a voter (if the ChangeRole request in Finish succeeded and its
  // initial role was PRE_VOTER).
  OpenTablet(meta, nullptr);
  // If OpenTablet fails, tablet_peer->error() will be set.
  RETURN_NOT_OK(ShutdownAndTombstoneTabletPeerNotOk(
      tablet_peer->error(), tablet_peer, meta, fs_manager_->uuid(),
      "Remote bootstrap: OpenTablet() failed", this));

  auto role_change_status = rb_client->VerifyChangeRoleSucceeded(tablet_peer->shared_consensus());
  if (role_change_status.ok()) {
    LOG(INFO) << kLogPrefix << "Remote bootstrap for tablet ended successfully";
  } else {
    // If for some reason this tserver wasn't promoted (e.g. from PRE-VOTER to VOTER), the leader
    // will find out and do the CHANGE_CONFIG.
    LOG(WARNING) << kLogPrefix << "Remote bootstrap finished. "
                               << "Failure calling VerifyChangeRoleSucceeded: "
                               << role_change_status.ToString();
  }

  WARN_NOT_OK(rb_client->Remove(), "Remove remote bootstrap sessions failed");

  return Status::OK();
}
1164
1165
// Create and register a new TabletPeer, given tablet metadata.
1166
Result<TabletPeerPtr> TSTabletManager::CreateAndRegisterTabletPeer(
1167
142k
    const RaftGroupMetadataPtr& meta, RegisterTabletPeerMode mode) {
1168
142k
  TabletPeerPtr tablet_peer(new tablet::TabletPeer(
1169
142k
      meta,
1170
142k
      local_peer_pb_,
1171
142k
      scoped_refptr<server::Clock>(server_->clock()),
1172
142k
      fs_manager_->uuid(),
1173
142k
      Bind(&TSTabletManager::ApplyChange, Unretained(this), meta->raft_group_id()),
1174
142k
      metric_registry_,
1175
142k
      this,
1176
142k
      async_client_init_->get_client_future()));
1177
142k
  RETURN_NOT_OK(RegisterTablet(meta->raft_group_id(), tablet_peer, mode));
1178
142k
  return tablet_peer;
1179
142k
}
1180
1181
// Deletes, tombstones or hides the local replica of tablet `tablet_id`.
//
// Parameters:
//   delete_type - must be TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED, otherwise
//                 InvalidArgument is returned.
//   cas_config_opid_index_less_or_equal - when set (and the tablet is neither already
//                 deleted/tombstoned nor FAILED), the operation is rejected with CAS_FAILED if
//                 the committed config opid_index is greater than this value.
//   hide_only   - when true, the tablet metadata is only marked hidden and flushed.
//   error_code  - output; set on the failure paths that have a matching tablet server error.
//
// On the regular path the peer is shut down, its on-disk data is deleted, the tablet is removed
// from the tablet map (TABLET_DATA_DELETED only) and its data/WAL dirs are unregistered.
Status TSTabletManager::DeleteTablet(
    const string& tablet_id,
    TabletDataState delete_type,
    const boost::optional<int64_t>& cas_config_opid_index_less_or_equal,
    bool hide_only,
    boost::optional<TabletServerErrorPB::Code>* error_code) {

  const bool supported_delete_type =
      delete_type == TABLET_DATA_DELETED || delete_type == TABLET_DATA_TOMBSTONED;
  if (!supported_delete_type) {
    return STATUS(InvalidArgument, "DeleteTablet() requires an argument that is one of "
                                   "TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED",
                                   Substitute("Given: $0 ($1)",
                                              TabletDataState_Name(delete_type), delete_type));
  }

  TRACE("Deleting tablet $0", tablet_id);

  TabletPeerPtr tablet_peer;
  scoped_refptr<TransitionInProgressDeleter> deleter;
  {
    // The lock is taken in exclusive mode as we'll add a entry to the
    // transition_in_progress_ map.
    std::lock_guard<RWMutex> lock(mutex_);
    TRACE("Acquired tablet manager lock");
    RETURN_NOT_OK(CheckRunningUnlocked(error_code));

    if (!LookupTabletUnlocked(tablet_id, &tablet_peer)) {
      *error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
      return STATUS(NotFound, "Tablet not found", tablet_id);
    }
    // Sanity check that the tablet's deletion isn't already in progress
    Status transition_status =
        StartTabletStateTransition(tablet_id, "deleting tablet", &deleter);
    if (PREDICT_FALSE(!transition_status.ok())) {
      *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
      return transition_status;
    }
  }

  // If the tablet is already deleted, the CAS check isn't possible because
  // consensus and therefore the log is not available.
  const TabletDataState data_state = tablet_peer->tablet_metadata()->tablet_data_state();
  const bool tablet_deleted =
      (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED);

  // If a tablet peer is in the FAILED state, then we need to be able to tombstone or delete this
  // tablet. If the tablet is tombstoned, then this TS can be remote bootstrapped with the same
  // tablet.
  const bool tablet_failed = tablet_peer->state() == RaftGroupStatePB::FAILED;

  // They specified an "atomic" delete. Check the committed config's opid_index.
  // TODO: There's actually a race here between the check and shutdown, but
  // it's tricky to fix. We could try checking again after the shutdown and
  // restarting the tablet if the local replica committed a higher config
  // change op during that time, or potentially something else more invasive.
  if (cas_config_opid_index_less_or_equal && !tablet_deleted && !tablet_failed) {
    shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus();
    if (!consensus) {
      *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
      return STATUS(IllegalState, "Consensus not available. Tablet shutting down");
    }
    RaftConfigPB committed_config = consensus->CommittedConfig();
    if (committed_config.opid_index() > *cas_config_opid_index_less_or_equal) {
      *error_code = TabletServerErrorPB::CAS_FAILED;
      return STATUS(IllegalState, Substitute("Request specified cas_config_opid_index_less_or_equal"
                                             " of $0 but the committed config has opid_index of $1",
                                             *cas_config_opid_index_less_or_equal,
                                             committed_config.opid_index()));
    }
  }

  RaftGroupMetadataPtr meta = tablet_peer->tablet_metadata();
  if (hide_only) {
    // Hiding only touches the metadata; the peer keeps running and no data is removed.
    meta->SetHidden(true);
    return meta->Flush();
  }

  // No matter if the tablet was deleted (drop table), or tombstoned (potentially moved to a
  // different TS), we do not need to flush rocksdb anymore, as this data is irrelevant.
  //
  // Note: This might change for PITR.
  const bool delete_data =
      delete_type == TABLET_DATA_DELETED || delete_type == TABLET_DATA_TOMBSTONED;
  RETURN_NOT_OK(tablet_peer->Shutdown(tablet::IsDropTable(delete_data)));

  yb::OpId last_logged_opid = tablet_peer->GetLatestLogEntryOpId();

  Status delete_status =
      DeleteTabletData(meta, delete_type, fs_manager_->uuid(), last_logged_opid, this);
  if (PREDICT_FALSE(!delete_status.ok())) {
    delete_status = delete_status.CloneAndPrepend(
        Substitute("Unable to delete on-disk data from tablet $0", tablet_id));
    LOG(WARNING) << delete_status.ToString();
    tablet_peer->SetFailed(delete_status);
    return delete_status;
  }

  tablet_peer->status_listener()->StatusMessage("Deleted tablet blocks from disk");

  // We only remove DELETED tablets from the tablet map.
  if (delete_type == TABLET_DATA_DELETED) {
    std::lock_guard<RWMutex> lock(mutex_);
    RETURN_NOT_OK(CheckRunningUnlocked(error_code));
    CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id;
    dirty_tablets_.erase(tablet_id);
  }

  // We unregister TOMBSTONED tablets in addition to DELETED tablets because they do not have
  // any more data on disk, so we shouldn't count these tablets when load balancing the disks.
  UnregisterDataWalDir(meta->table_id(), tablet_id, meta->data_root_dir(), meta->wal_root_dir());

  return Status::OK();
}
1295
1296
// Returns OK when the manager state is MANAGER_RUNNING. Otherwise sets `*error_code` to
// TABLET_NOT_RUNNING and returns a ServiceUnavailable status naming the current state.
Status TSTabletManager::CheckRunningUnlocked(
    boost::optional<TabletServerErrorPB::Code>* error_code) const {
  if (state_ != MANAGER_RUNNING) {
    *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
    return STATUS(ServiceUnavailable, Substitute("Tablet Manager is not running: $0",
                                                 TSTabletManagerStatePB_Name(state_)));
  }
  return Status::OK();
}
1305
1306
// NO_THREAD_SAFETY_ANALYSIS because this analysis does not work with unique_lock.
1307
Status TSTabletManager::StartTabletStateTransition(
1308
    const string& tablet_id,
1309
    const string& reason,
1310
228k
    scoped_refptr<TransitionInProgressDeleter>* deleter) NO_THREAD_SAFETY_ANALYSIS {
1311
228k
  std::unique_lock<std::mutex> lock(transition_in_progress_mutex_);
1312
228k
  const auto emplace_result = transition_in_progress_.emplace(tablet_id, reason);
1313
228k
  if (!emplace_result.second) {
1314
10.3k
    return STATUS_FORMAT(
1315
10.3k
        AlreadyPresent, "State transition of tablet $0 already in progress: $1", tablet_id,
1316
10.3k
        *emplace_result.first);
1317
10.3k
  }
1318
217k
  deleter->reset(new TransitionInProgressDeleter(
1319
217k
      &transition_in_progress_, &transition_in_progress_mutex_, tablet_id));
1320
217k
  return Status::OK();
1321
228k
}
1322
1323
68
bool TSTabletManager::IsTabletInTransition(const TabletId& tablet_id) const {
1324
68
  std::unique_lock<std::mutex> lock(transition_in_progress_mutex_);
1325
68
  return ContainsKey(transition_in_progress_, tablet_id);
1326
68
}
1327
1328
// Loads the raft group metadata of tablet `tablet_id` through fs_manager_ and stores it into
// `*metadata`. Returns the load error (prefixed with the tablet id) on failure.
Status TSTabletManager::OpenTabletMeta(const string& tablet_id,
                                       RaftGroupMetadataPtr* metadata) {
  LOG(INFO) << "Loading metadata for tablet " << tablet_id;
  TRACE("Loading metadata...");

  auto loaded_meta = RaftGroupMetadata::Load(fs_manager_, tablet_id);
  RETURN_NOT_OK_PREPEND(
      loaded_meta, Format("Failed to load tablet metadata for tablet id $0", tablet_id));

  TRACE("Metadata loaded");
  // Hand the freshly loaded metadata over to the caller.
  metadata->swap(*loaded_meta);
  return Status::OK();
}
1339
1340
void TSTabletManager::OpenTablet(const RaftGroupMetadataPtr& meta,
1341
142k
                                 const scoped_refptr<TransitionInProgressDeleter>& deleter) {
1342
142k
  string tablet_id = meta->raft_group_id();
1343
142k
  TRACE_EVENT1("tserver", "TSTabletManager::OpenTablet",
1344
142k
               "tablet_id", tablet_id);
1345
1346
142k
  TabletPeerPtr tablet_peer;
1347
18.4E
  CHECK(LookupTablet(tablet_id, &tablet_peer))
1348
18.4E
      << "Tablet not registered prior to OpenTabletAsync call: " << tablet_id;
1349
1350
142k
  tablet::TabletPtr tablet;
1351
142k
  scoped_refptr<Log> log;
1352
142k
  const string kLogPrefix = TabletLogPrefix(tablet_id);
1353
1354
142k
  LOG(INFO) << kLogPrefix << "Bootstrapping tablet";
1355
142k
  TRACE("Bootstrapping tablet");
1356
1357
142k
  consensus::ConsensusBootstrapInfo bootstrap_info;
1358
142k
  consensus::RetryableRequests retryable_requests(kLogPrefix);
1359
142k
  yb::OpId split_op_id;
1360
1361
142k
  LOG_TIMING_PREFIX(INFO, kLogPrefix, "bootstrapping tablet") {
1362
    // Read flag before CAS to avoid TSAN race conflict with GetAllFlags.
1363
142k
    if (GetAtomicFlag(&FLAGS_TEST_force_single_tablet_failure) &&
1364
142k
        CompareAndSetFlag(&FLAGS_TEST_force_single_tablet_failure,
1365
3
                          true /* expected */, false /* val */)) {
1366
3
      LOG(ERROR) << "Setting the state of a tablet to FAILED";
1367
3
      tablet_peer->SetFailed(STATUS(InternalError, "Setting tablet to failed state for test",
1368
3
                                    tablet_id));
1369
3
      return;
1370
3
    }
1371
1372
    // TODO: handle crash mid-creation of tablet? do we ever end up with a
1373
    // partially created tablet here?
1374
142k
    auto s = tablet_peer->SetBootstrapping();
1375
142k
    if (!s.ok()) {
1376
0
      LOG(ERROR) << kLogPrefix << "Tablet failed to set bootstrapping: " << s;
1377
0
      tablet_peer->SetFailed(s);
1378
0
      return;
1379
0
    }
1380
1381
142k
    tablet::TabletInitData tablet_init_data = {
1382
142k
      .metadata = meta,
1383
142k
      .client_future = async_client_init_->get_client_future(),
1384
142k
      .clock = scoped_refptr<server::Clock>(server_->clock()),
1385
142k
      .parent_mem_tracker = MemTracker::FindOrCreateTracker("Tablets", server_->mem_tracker()),
1386
142k
      .block_based_table_mem_tracker = mem_manager_->block_based_table_mem_tracker(),
1387
142k
      .metric_registry = metric_registry_,
1388
142k
      .log_anchor_registry = tablet_peer->log_anchor_registry(),
1389
142k
      .tablet_options = tablet_options_,
1390
142k
      .log_prefix_suffix = " P " + tablet_peer->permanent_uuid(),
1391
142k
      .transaction_participant_context = tablet_peer.get(),
1392
142k
      .local_tablet_filter = std::bind(&TSTabletManager::PreserveLocalLeadersOnly, this, _1),
1393
142k
      .transaction_coordinator_context = tablet_peer.get(),
1394
142k
      .txns_enabled = tablet::TransactionsEnabled::kTrue,
1395
      // We are assuming we're never dealing with the system catalog tablet in TSTabletManager.
1396
142k
      .is_sys_catalog = tablet::IsSysCatalogTablet::kFalse,
1397
142k
      .snapshot_coordinator = nullptr,
1398
142k
      .tablet_splitter = this,
1399
142k
      .allowed_history_cutoff_provider = std::bind(
1400
142k
          &TSTabletManager::AllowedHistoryCutoff, this, _1),
1401
142k
    };
1402
142k
    tablet::BootstrapTabletData data = {
1403
142k
      .tablet_init_data = tablet_init_data,
1404
142k
      .listener = tablet_peer->status_listener(),
1405
142k
      .append_pool = append_pool(),
1406
142k
      .allocation_pool = allocation_pool_.get(),
1407
142k
      .retryable_requests = &retryable_requests,
1408
142k
    };
1409
142k
    s = BootstrapTablet(data, &tablet, &log, &bootstrap_info);
1410
142k
    if (!s.ok()) {
1411
13
      LOG(ERROR) << kLogPrefix << "Tablet failed to bootstrap: " << s;
1412
13
      tablet_peer->SetFailed(s);
1413
13
      return;
1414
13
    }
1415
142k
  }
1416
1417
142k
  MonoTime start(MonoTime::Now());
1418
142k
  LOG_TIMING_PREFIX(INFO, kLogPrefix, "starting tablet") {
1419
142k
    TRACE("Initializing tablet peer");
1420
142k
    auto s = tablet_peer->InitTabletPeer(
1421
142k
        tablet,
1422
142k
        server_->mem_tracker(),
1423
142k
        server_->messenger(),
1424
142k
        &server_->proxy_cache(),
1425
142k
        log,
1426
142k
        tablet->GetTableMetricsEntity(),
1427
142k
        tablet->GetTabletMetricsEntity(),
1428
142k
        raft_pool(),
1429
142k
        tablet_prepare_pool(),
1430
142k
        &retryable_requests,
1431
142k
        multi_raft_manager_.get());
1432
1433
142k
    if (!s.ok()) {
1434
2
      LOG(ERROR) << kLogPrefix << "Tablet failed to init: "
1435
2
                 << s.ToString();
1436
2
      tablet_peer->SetFailed(s);
1437
2
      return;
1438
2
    }
1439
1440
142k
    TRACE("Starting tablet peer");
1441
142k
    s = tablet_peer->Start(bootstrap_info);
1442
142k
    if (!s.ok()) {
1443
0
      LOG(ERROR) << kLogPrefix << "Tablet failed to start: "
1444
0
                 << s.ToString();
1445
0
      tablet_peer->SetFailed(s);
1446
0
      return;
1447
0
    }
1448
1449
142k
    tablet_peer->RegisterMaintenanceOps(server_->maintenance_manager());
1450
142k
  }
1451
1452
142k
  auto elapsed_ms = MonoTime::Now().GetDeltaSince(start).ToMilliseconds();
1453
142k
  if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) {
1454
112
    LOG(WARNING) << kLogPrefix << "Tablet startup took " << elapsed_ms << "ms";
1455
112
    if (Trace::CurrentTrace()) {
1456
0
      LOG(WARNING) << kLogPrefix << "Trace:" << std::endl
1457
0
                   << Trace::CurrentTrace()->DumpToString(true);
1458
0
    }
1459
112
  }
1460
1461
142k
  if (PREDICT_TRUE(!FLAGS_TEST_skip_post_split_compaction)) {
1462
142k
    WARN_NOT_OK(
1463
142k
    tablet->TriggerPostSplitCompactionIfNeeded([&]() {
1464
142k
      return post_split_trigger_compaction_pool_->NewToken(ThreadPool::ExecutionMode::SERIAL);
1465
142k
    }),
1466
142k
    "Failed to submit compaction for post-split tablet.");
1467
142k
  } else {
1468
3
    LOG(INFO) << "Skipping post split compaction " << meta->raft_group_id();
1469
3
  }
1470
1471
142k
  if (tablet->ShouldDisableLbMove()) {
1472
134
    std::lock_guard<RWMutex> lock(mutex_);
1473
134
    tablets_blocked_from_lb_.insert(tablet->tablet_id());
1474
134
    VLOG(2) << TabletLogPrefix(tablet->tablet_id())
1475
0
            << " marking as maybe being compacted after split.";
1476
134
  }
1477
142k
}
1478
1479
0
// Submits one ForceFullRocksDBCompact() task per tablet to a concurrent token of
// admin_triggered_compaction_pool_ and blocks until every submitted task has counted down.
// Returns early with the submission error if a task cannot be submitted; a failed compaction
// itself is only logged as a warning.
Status TSTabletManager::TriggerCompactionAndWait(const TabletPtrs& tablets) {
  CountDownLatch pending(tablets.size());
  auto token = admin_triggered_compaction_pool_->NewToken(ThreadPool::ExecutionMode::CONCURRENT);
  for (const auto& tablet : tablets) {
    // `tablet` is captured by value so the task keeps its own reference to the tablet.
    auto compact_task = [&pending, tablet]() {
      WARN_NOT_OK(tablet->ForceFullRocksDBCompact(), "Failed to submit compaction for tablet.");
      pending.CountDown();
    };
    RETURN_NOT_OK(token->SubmitFunc(compact_task));
  }
  pending.Wait();
  return Status::OK();
}
1491
1492
193
void TSTabletManager::StartShutdown() {
1493
193
  {
1494
193
    std::lock_guard<RWMutex> lock(mutex_);
1495
193
    switch (state_) {
1496
0
      case MANAGER_QUIESCING: {
1497
0
        VLOG(1) << "Tablet manager shut down already in progress..";
1498
0
        return;
1499
0
      }
1500
0
      case MANAGER_SHUTDOWN: {
1501
0
        VLOG(1) << "Tablet manager has already been shut down.";
1502
0
        return;
1503
0
      }
1504
0
      case MANAGER_INITIALIZING:
1505
193
      case MANAGER_RUNNING: {
1506
193
        LOG_WITH_PREFIX(INFO) << "Shutting down tablet manager...";
1507
193
        state_ = MANAGER_QUIESCING;
1508
193
        break;
1509
0
      }
1510
0
      default: {
1511
0
        LOG(FATAL) << "Invalid state: " << TSTabletManagerStatePB_Name(state_);
1512
0
      }
1513
193
    }
1514
193
  }
1515
1516
193
  tablets_cleaner_->Shutdown();
1517
1518
193
  verify_tablet_data_poller_->Shutdown();
1519
1520
193
  metrics_cleaner_->Shutdown();
1521
1522
193
  async_client_init_->Shutdown();
1523
1524
193
  mem_manager_->Shutdown();
1525
1526
  // Wait for all RBS operations to finish.
1527
193
  const MonoDelta kSingleWait = 10ms;
1528
193
  const MonoDelta kReportInterval = 5s;
1529
193
  const MonoDelta kMaxWait = 30s;
1530
193
  MonoDelta waited = MonoDelta::kZero;
1531
193
  MonoDelta next_report_time = kReportInterval;
1532
193
  while (int remaining_rbs = num_tablets_being_remote_bootstrapped_ > 0) {
1533
0
    if (waited >= next_report_time) {
1534
0
      if (waited >= kMaxWait) {
1535
0
        std::string addr = "";
1536
0
        for (auto iter = bootstrap_source_addresses_.begin();
1537
0
             iter != bootstrap_source_addresses_.end();
1538
0
             iter++) {
1539
0
          if (iter == bootstrap_source_addresses_.begin()) {
1540
0
            addr += *iter;
1541
0
          } else {
1542
0
            addr += "," + *iter;
1543
0
          }
1544
0
        }
1545
0
        LOG_WITH_PREFIX(DFATAL)
1546
0
            << "Waited for " << waited << "ms. Still had "
1547
0
            << remaining_rbs << " pending remote bootstraps: " + addr;
1548
0
      } else {
1549
0
        LOG_WITH_PREFIX(WARNING)
1550
0
            << "Still waiting for " << remaining_rbs
1551
0
            << " ongoing RemoteBootstraps to finish after " << waited;
1552
0
      }
1553
0
      next_report_time = std::min(kMaxWait, waited + kReportInterval);
1554
0
    }
1555
0
    SleepFor(kSingleWait);
1556
0
    waited += kSingleWait;
1557
0
  }
1558
1559
  // Shut down the bootstrap pool, so new tablets are registered after this point.
1560
193
  open_tablet_pool_->Shutdown();
1561
1562
  // Take a snapshot of the peers list -- that way we don't have to hold
1563
  // on to the lock while shutting them down, which might cause a lock
1564
  // inversion. (see KUDU-308 for example).
1565
193
  for (const TabletPeerPtr& peer : GetTabletPeers()) {
1566
98
    if (peer->StartShutdown()) {
1567
97
      shutting_down_peers_.push_back(peer);
1568
97
    }
1569
98
  }
1570
193
}
1571
1572
89
void TSTabletManager::CompleteShutdown() {
1573
97
  for (const TabletPeerPtr& peer : shutting_down_peers_) {
1574
97
    peer->CompleteShutdown();
1575
97
  }
1576
1577
  // Shut down the apply pool.
1578
89
  apply_pool_->Shutdown();
1579
1580
89
  if (raft_pool_) {
1581
89
    raft_pool_->Shutdown();
1582
89
  }
1583
89
  if (tablet_prepare_pool_) {
1584
89
    tablet_prepare_pool_->Shutdown();
1585
89
  }
1586
89
  if (append_pool_) {
1587
89
    append_pool_->Shutdown();
1588
89
  }
1589
89
  if (post_split_trigger_compaction_pool_) {
1590
89
    post_split_trigger_compaction_pool_->Shutdown();
1591
89
  }
1592
89
  if (admin_triggered_compaction_pool_) {
1593
89
    admin_triggered_compaction_pool_->Shutdown();
1594
89
  }
1595
1596
89
  {
1597
89
    std::lock_guard<RWMutex> l(mutex_);
1598
89
    tablet_map_.clear();
1599
89
    dirty_tablets_.clear();
1600
1601
89
    std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
1602
89
    table_data_assignment_map_.clear();
1603
89
    table_wal_assignment_map_.clear();
1604
1605
89
    state_ = MANAGER_SHUTDOWN;
1606
89
  }
1607
89
}
1608
1609
177k
std::string TSTabletManager::LogPrefix() const {
1610
177k
  return "P " + fs_manager_->uuid() + ": ";
1611
177k
}
1612
1613
292k
std::string TSTabletManager::TabletLogPrefix(const TabletId& tablet_id) const {
1614
292k
  return tserver::LogPrefix(tablet_id, fs_manager_->uuid());
1615
292k
}
1616
1617
152k
bool TSTabletManager::ClosingUnlocked() const {
1618
152k
  return state_ == MANAGER_QUIESCING || state_ == MANAGER_SHUTDOWN;
1619
152k
}
1620
1621
// Adds `tablet_peer` to tablet_map_ under `tablet_id` while holding mutex_ exclusively.
//
// Returns:
//   ShutdownInProgress - the manager is quiescing or shut down.
//   NotFound           - `mode` is REPLACEMENT_PEER but no previous peer was registered.
//   AlreadyPresent     - a peer is already registered under `tablet_id`.
// Every failure is also logged as a warning.
Status TSTabletManager::RegisterTablet(const TabletId& tablet_id,
                                       const TabletPeerPtr& tablet_peer,
                                       RegisterTabletPeerMode mode) {
  std::lock_guard<RWMutex> lock(mutex_);
  if (ClosingUnlocked()) {
    auto status = STATUS_FORMAT(
        ShutdownInProgress, "Unable to register tablet peer: $0: closing", tablet_id);
    LOG(WARNING) << status;
    return status;
  }

  // If we are replacing a tablet peer, we delete the existing one first.
  if (mode == REPLACEMENT_PEER) {
    const auto num_erased = tablet_map_.erase(tablet_id);
    if (num_erased != 1) {
      auto status = STATUS_FORMAT(
          NotFound, "Unable to remove previous tablet peer $0: not registered", tablet_id);
      LOG(WARNING) << status;
      return status;
    }
  }

  const bool inserted = InsertIfNotPresent(&tablet_map_, tablet_id, tablet_peer);
  if (!inserted) {
    auto status = STATUS_FORMAT(
        AlreadyPresent, "Unable to register tablet peer $0: already registered", tablet_id);
    LOG(WARNING) << status;
    return status;
  }

  LOG_WITH_PREFIX(INFO) << "Registered tablet " << tablet_id;

  return Status::OK();
}
1650
1651
bool TSTabletManager::LookupTablet(const string& tablet_id,
1652
38.0M
                                   TabletPeerPtr* tablet_peer) const {
1653
38.0M
  SharedLock<RWMutex> shared_lock(mutex_);
1654
38.0M
  return LookupTabletUnlocked(tablet_id, tablet_peer);
1655
38.0M
}
1656
1657
// Result-returning variant of LookupTablet: yields the registered peer for `tablet_id`, or a
// NotFound status when no peer is registered under that id.
Result<std::shared_ptr<tablet::TabletPeer>> TSTabletManager::LookupTablet(
    const TabletId& tablet_id) const {
  TabletPeerPtr peer;
  SCHECK(LookupTablet(tablet_id, &peer), NotFound, Format("Tablet $0 not found", tablet_id));
  return peer;
}
1663
1664
bool TSTabletManager::LookupTabletUnlocked(const string& tablet_id,
1665
38.3M
                                           TabletPeerPtr* tablet_peer) const {
1666
38.3M
  const TabletPeerPtr* found = FindOrNull(tablet_map_, tablet_id);
1667
38.3M
  if (!found) {
1668
145k
    return false;
1669
145k
  }
1670
38.1M
  *tablet_peer = *found;
1671
38.1M
  return true;
1672
38.3M
}
1673
1674
// Fetches the peer registered under `tablet_id` into `*tablet_peer`.
//
// Returns NotFound when no peer is registered, and IllegalState when the peer's tablet data
// state does not satisfy CanServeTabletData(). In the IllegalState case `*tablet_peer` has
// already been filled in by the lookup.
Status TSTabletManager::GetTabletPeer(const string& tablet_id,
                                      TabletPeerPtr* tablet_peer) const {
  const bool found = LookupTablet(tablet_id, tablet_peer);
  if (!found) {
    return STATUS(NotFound, "Tablet not found", tablet_id);
  }
  const TabletDataState data_state = (*tablet_peer)->tablet_metadata()->tablet_data_state();
  if (CanServeTabletData(data_state)) {
    return Status::OK();
  }
  return STATUS(
      IllegalState, "Tablet data state not ready: " + TabletDataState_Name(data_state),
      tablet_id);
}
1687
1688
24.5M
// Returns the node instance protobuf exposed by the owning server.
const NodeInstancePB& TSTabletManager::NodeInstance() const {
  const NodeInstancePB& instance = server_->instance_pb();
  return instance;
}
1691
1692
1
// Fills `reg` by delegating to the owning server's GetRegistration with RpcOnly::kTrue.
Status TSTabletManager::GetRegistration(ServerRegistrationPB* reg) const {
  const auto rpc_only = server::RpcOnly::kTrue;
  return server_->GetRegistration(reg, rpc_only);
}
1695
1696
1.42M
// Returns a snapshot of all registered, non-null tablet peers, taken under a shared lock on
// mutex_. When `tablet_ptrs` is non-null, the non-null shared_tablet() of each peer is also
// appended to it.
TSTabletManager::TabletPeers TSTabletManager::GetTabletPeers(TabletPtrs* tablet_ptrs) const {
  SharedLock<RWMutex> shared_lock(mutex_);
  TabletPeers peers;
  GetTabletPeersUnlocked(&peers);
  if (tablet_ptrs == nullptr) {
    return peers;
  }
  for (const auto& peer : peers) {
    if (!peer) {
      continue;
    }
    if (auto tablet_ptr = peer->shared_tablet()) {
      tablet_ptrs->push_back(tablet_ptr);
    }
  }
  return peers;
}
1711
1712
1.43M
// Appends every non-null peer in tablet_map_ to `*tablet_peers`. Takes no lock itself.
void TSTabletManager::GetTabletPeersUnlocked(TabletPeers* tablet_peers) const {
  DCHECK(tablet_peers != nullptr);
  // See AppendKeysFromMap for why this is done.
  if (tablet_peers->empty()) {
    tablet_peers->reserve(tablet_map_.size());
  }
  for (const auto& id_and_peer : tablet_map_) {
    const auto& peer = id_and_peer.second;
    if (peer == nullptr) {
      continue;
    }
    tablet_peers->push_back(peer);
  }
}
1724
1725
411k
// Filters `*tablet_ids` in place under a shared lock on mutex_, keeping only the ids whose peer
// is present in tablet_map_ and reports LeaderStatus() == LEADER_AND_READY.
void TSTabletManager::PreserveLocalLeadersOnly(std::vector<const TabletId*>* tablet_ids) const {
  SharedLock<decltype(mutex_)> shared_lock(mutex_);
  // Predicate is true for ids that must be dropped: unknown tablets and non-ready leaders.
  auto should_drop = [this](const TabletId* id) REQUIRES_SHARED(mutex_) {
    const auto it = tablet_map_.find(*id);
    if (it == tablet_map_.end()) {
      return true;
    }
    return it->second->LeaderStatus() != consensus::LeaderStatus::LEADER_AND_READY;
  };
  const auto first_dropped = std::remove_if(tablet_ids->begin(), tablet_ids->end(), should_drop);
  tablet_ids->erase(first_dropped, tablet_ids->end());
}
1738
1739
void TSTabletManager::ApplyChange(const string& tablet_id,
1740
592k
                                  shared_ptr<consensus::StateChangeContext> context) {
1741
592k
  WARN_NOT_OK(
1742
592k
      apply_pool_->SubmitFunc(
1743
592k
          std::bind(&TSTabletManager::MarkTabletDirty, this, tablet_id, context)),
1744
592k
      "Unable to run MarkDirty callback")
1745
592k
}
1746
1747
void TSTabletManager::MarkTabletDirty(const TabletId& tablet_id,
1748
592k
                                      std::shared_ptr<consensus::StateChangeContext> context) {
1749
592k
  std::lock_guard<RWMutex> lock(mutex_);
1750
592k
  MarkDirtyUnlocked(tablet_id, context);
1751
592k
}
1752
1753
void TSTabletManager::MarkTabletBeingRemoteBootstrapped(
1754
1.96k
    const TabletId& tablet_id, const TableId& table_id) {
1755
1.96k
  std::lock_guard<RWMutex> lock(mutex_);
1756
1.96k
  tablets_being_remote_bootstrapped_.insert(tablet_id);
1757
1.96k
  tablets_being_remote_bootstrapped_per_table_[table_id].insert(tablet_id);
1758
1.96k
  MaybeDoChecksForTests(table_id);
1759
1.96k
  LOG(INFO) << "Concurrent remote bootstrap sessions: "
1760
1.96k
            << tablets_being_remote_bootstrapped_.size()
1761
1.96k
            << "Concurrent remote bootstrap sessions for table " << table_id
1762
1.96k
            << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size();
1763
1.96k
}
1764
1765
void TSTabletManager::UnmarkTabletBeingRemoteBootstrapped(
1766
1.94k
    const TabletId& tablet_id, const TableId& table_id) {
1767
1.94k
  std::lock_guard<RWMutex> lock(mutex_);
1768
1.94k
  tablets_being_remote_bootstrapped_.erase(tablet_id);
1769
1.94k
  tablets_being_remote_bootstrapped_per_table_[table_id].erase(tablet_id);
1770
1.94k
}
1771
1772
0
size_t TSTabletManager::TEST_GetNumDirtyTablets() const {
1773
0
  SharedLock<RWMutex> lock(mutex_);
1774
0
  return dirty_tablets_.size();
1775
0
}
1776
1777
// Fills `resp` with the number of tablets that are still NOT_STARTED or BOOTSTRAPPING and the
// total number of tablets that are NOT_STARTED, BOOTSTRAPPING or RUNNING.
//
// Tablets whose data state fails CanServeTabletData() are ignored. When the manager is not in
// MANAGER_RUNNING state both fields are set to INT_MAX. Always returns OK.
Status TSTabletManager::GetNumTabletsPendingBootstrap(
    IsTabletServerReadyResponsePB* resp) const {
  if (state() != MANAGER_RUNNING) {
    resp->set_num_tablets_not_running(INT_MAX);
    resp->set_total_tablets(INT_MAX);
    return Status::OK();
  }

  SharedLock<RWMutex> shared_lock(mutex_);
  int num_pending = 0;
  int total_tablets = 0;
  for (const auto& entry : tablet_map_) {
    const auto& peer = entry.second;
    const RaftGroupStatePB peer_state = peer->state();
    const TabletDataState data_state = peer->data_state();
    // Do not count tablets that will never get to RUNNING state.
    if (!CanServeTabletData(data_state)) {
      continue;
    }
    const bool pending = peer_state == NOT_STARTED || peer_state == BOOTSTRAPPING;
    if (pending) {
      ++total_tablets;
      ++num_pending;
    } else if (peer_state == RUNNING) {
      ++total_tablets;
    }
  }

  LOG(INFO) << num_pending << " tablets pending bootstrap out of " << total_tablets;
  resp->set_num_tablets_not_running(num_pending);
  resp->set_total_tablets(total_tablets);

  return Status::OK();
}
1810
1811
5.25M
int TSTabletManager::GetNumLiveTablets() const {
1812
5.25M
  int count = 0;
1813
5.25M
  SharedLock<RWMutex> lock(mutex_);
1814
24.9M
  for (const auto& entry : tablet_map_) {
1815
24.9M
    RaftGroupStatePB state = entry.second->state();
1816
24.9M
    if (state == BOOTSTRAPPING ||
1817
24.9M
        
state == RUNNING24.5M
) {
1818
23.9M
      count++;
1819
23.9M
    }
1820
24.9M
  }
1821
5.25M
  return count;
1822
5.25M
}
1823
1824
5.25M
int TSTabletManager::GetLeaderCount() const {
1825
5.25M
  int count = 0;
1826
5.25M
  SharedLock<RWMutex> lock(mutex_);
1827
24.9M
  for (const auto& entry : tablet_map_) {
1828
24.9M
    consensus::LeaderStatus leader_status = entry.second->LeaderStatus(/* allow_stale =*/ true);
1829
24.9M
    if (leader_status != consensus::LeaderStatus::NOT_LEADER) {
1830
7.09M
      count++;
1831
7.09M
    }
1832
24.9M
  }
1833
5.25M
  return count;
1834
5.25M
}
1835
1836
void TSTabletManager::MarkDirtyUnlocked(const TabletId& tablet_id,
1837
592k
                                        std::shared_ptr<consensus::StateChangeContext> context) {
1838
592k
  TabletReportState* state = FindOrNull(dirty_tablets_, tablet_id);
1839
592k
  if (state != nullptr) {
1840
250k
    CHECK_GE(next_report_seq_, state->change_seq);
1841
250k
    state->change_seq = next_report_seq_;
1842
342k
  } else {
1843
342k
    TabletReportState state;
1844
342k
    state.change_seq = next_report_seq_;
1845
342k
    InsertOrDie(&dirty_tablets_, tablet_id, state);
1846
342k
  }
1847
592k
  VLOG(2) << TabletLogPrefix(tablet_id)
1848
0
          << "Marking dirty. Reason: " << AsString(context)
1849
0
          << ". Will report this tablet to the Master in the next heartbeat "
1850
0
          << "as part of report #" << next_report_seq_;
1851
592k
  server_->heartbeater()->TriggerASAP();
1852
592k
}
1853
1854
8.74k
void TSTabletManager::InitLocalRaftPeerPB() {
1855
8.74k
  DCHECK_EQ(state(), MANAGER_INITIALIZING);
1856
8.74k
  local_peer_pb_.set_permanent_uuid(fs_manager_->uuid());
1857
8.74k
  ServerRegistrationPB reg;
1858
8.74k
  CHECK_OK(server_->GetRegistration(&reg, server::RpcOnly::kTrue));
1859
8.74k
  TakeRegistration(&reg, &local_peer_pb_);
1860
8.74k
}
1861
1862
void TSTabletManager::CreateReportedTabletPB(const TabletPeerPtr& tablet_peer,
1863
458k
                                             ReportedTabletPB* reported_tablet) {
1864
458k
  reported_tablet->set_tablet_id(tablet_peer->tablet_id());
1865
458k
  reported_tablet->set_state(tablet_peer->state());
1866
458k
  reported_tablet->set_tablet_data_state(tablet_peer->tablet_metadata()->tablet_data_state());
1867
458k
  if (tablet_peer->state() == tablet::FAILED) {
1868
0
    AppStatusPB* error_status = reported_tablet->mutable_error();
1869
0
    StatusToPB(tablet_peer->error(), error_status);
1870
0
  }
1871
458k
  reported_tablet->set_schema_version(tablet_peer->tablet_metadata()->schema_version());
1872
1873
458k
  {
1874
458k
    auto tablet_ptr = tablet_peer->shared_tablet();
1875
458k
    if (tablet_ptr != nullptr) {
1876
454k
      reported_tablet->set_should_disable_lb_move(tablet_ptr->ShouldDisableLbMove());
1877
454k
    }
1878
458k
  }
1879
458k
  reported_tablet->set_fs_data_dir(tablet_peer->tablet_metadata()->data_root_dir());
1880
1881
  // We cannot get consensus state information unless the TabletPeer is running.
1882
458k
  shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus();
1883
458k
  if (consensus) {
1884
454k
    *reported_tablet->mutable_committed_consensus_state() =
1885
454k
        consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED);
1886
454k
  }
1887
1888
  // Set the hide status of the tablet.
1889
458k
  reported_tablet->set_is_hidden(tablet_peer->tablet_metadata()->hidden());
1890
458k
}
1891
1892
5.24M
// Builds an incremental tablet report in `report`.
//
// Under an exclusive lock on mutex_ this:
//   - assigns the report the next sequence number;
//   - re-evaluates tablets_blocked_from_lb_, dropping tablets that are gone or no longer need
//     load-balancer moves disabled (the latter are marked dirty);
//   - when `include_bootstrap` is set, marks every tablet being remote bootstrapped as dirty;
//   - collects the peers of all dirty tablets, and lists dirty tablets without a peer as
//     removed (those are also dropped from dirty_tablets_).
// The per-tablet protobufs are then filled in outside the lock, capped at report_limit_.
void TSTabletManager::GenerateTabletReport(TabletReportPB* report, bool include_bootstrap) {
  report->Clear();
  // Creating the tablet report can be slow in the case that it is in the
  // middle of flushing its consensus metadata. We don't want to hold
  // mutex_ for too long, even in read mode, since it can cause other readers
  // to block if there is a waiting writer (see KUDU-2193). So, we just make
  // a local copy of the set of replicas.
  vector<std::shared_ptr<TabletPeer>> to_report;
  TabletIdSet tablet_ids;
  size_t dirty_count, report_limit;
  {
    std::lock_guard<RWMutex> write_lock(mutex_);
    const uint32_t cur_report_seq = next_report_seq_++;
    report->set_sequence_number(cur_report_seq);

    for (auto it = tablets_blocked_from_lb_.begin(); it != tablets_blocked_from_lb_.end();) {
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, *it);
      if (!tablet_peer) {
        VLOG(1) << "Tablet " << *it
                << " was marked as blocked from load balancing but was not found";
        it = tablets_blocked_from_lb_.erase(it);
        continue;
      }

      const auto tablet = (*tablet_peer)->shared_tablet();
      // If tablet is null, one of two things may be true:
      // 1. TabletPeer::InitTabletPeer was not called yet
      //
      // Skip and keep tablet in tablets_blocked_from_lb_ till call InitTabletPeer.
      //
      // 2. TabletPeer::CompleteShutdown was called
      //
      // Tablet will be removed from tablets_blocked_from_lb_ with next GenerateTabletReport
      // since tablet_peer will be removed from tablet_map_
      if (tablet == nullptr) {
        ++it;
        continue;
      }

      const std::string& tablet_id = tablet->tablet_id();
      if (tablet->ShouldDisableLbMove()) {
        // Still blocked; keep the entry.
        ++it;
        continue;
      }

      it = tablets_blocked_from_lb_.erase(it);
      VLOG(1) << "Tablet " << tablet_id << " is no longer blocked from load-balancing.";
      InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq});
    }

    if (include_bootstrap) {
      for (auto const& tablet_id : tablets_being_remote_bootstrapped_) {
        VLOG(1) << "Tablet " << tablet_id << " being remote bootstrapped and marked for report";
        InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq});
      }
    }

    // Copy the dirty ids first: dirty_tablets_ may be modified while they are processed below.
    for (const DirtyMap::value_type& dirty_entry : dirty_tablets_) {
      tablet_ids.insert(dirty_entry.first);
    }

    for (auto const& tablet_id : tablet_ids) {
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id);
      if (!tablet_peer) {
        // Tell the Master that this tablet was removed from the TServer side.
        report->add_removed_tablet_ids(tablet_id);
        // Don't count this as a 'dirty_tablet_' because the Master may not have it either.
        dirty_tablets_.erase(tablet_id);
        continue;
      }
      // Dirty entry, report on it.
      to_report.push_back(*tablet_peer);
    }
    dirty_count = dirty_tablets_.size();
    report_limit = report_limit_;
  }

  for (const auto& replica : to_report) {
    CreateReportedTabletPB(replica, report->add_updated_tablets());
    // Enforce a max tablet limit on reported tablets.
    if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) {
      break;
    }
  }
  report->set_remaining_tablet_count(
      narrow_cast<int>(dirty_count - report->updated_tablets_size()));
}
1974
1975
8.22k
// Builds a full tablet report in `report`: every registered peer is marked dirty with the new
// sequence number and reported, capped at report_limit_ entries. The number of dirty tablets
// not included in this report is stored as the remaining tablet count.
void TSTabletManager::StartFullTabletReport(TabletReportPB* report) {
  report->Clear();
  // Creating the tablet report can be slow in the case that it is in the
  // middle of flushing its consensus metadata. We don't want to hold
  // mutex_ for too long, even in read mode, since it can cause other readers
  // to block if there is a waiting writer (see KUDU-2193). So, we just make
  // a local copy of the set of replicas.
  vector<std::shared_ptr<TabletPeer>> to_report;
  size_t dirty_count, report_limit;
  {
    std::lock_guard<RWMutex> write_lock(mutex_);
    const uint32_t cur_report_seq = next_report_seq_++;
    report->set_sequence_number(cur_report_seq);
    GetTabletPeersUnlocked(&to_report);
    // Mark all tablets as dirty, to be cleaned when reading the heartbeat response.
    for (const auto& peer : to_report) {
      InsertOrUpdate(&dirty_tablets_, peer->tablet_id(), TabletReportState{cur_report_seq});
    }
    dirty_count = dirty_tablets_.size();
    report_limit = report_limit_;
  }

  for (const auto& replica : to_report) {
    CreateReportedTabletPB(replica, report->add_updated_tablets());
    // Enforce a max tablet limit on reported tablets.
    if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) {
      break;
    }
  }
  report->set_remaining_tablet_count(
      narrow_cast<int32_t>(dirty_count - report->updated_tablets_size()));
}
2004
2005
void TSTabletManager::MarkTabletReportAcknowledged(uint32_t acked_seq,
2006
                                                   const TabletReportUpdatesPB& updates,
2007
4.82M
                                                   bool dirty_check) {
2008
4.82M
  std::lock_guard<RWMutex> l(mutex_);
2009
2010
4.82M
  CHECK_LT(acked_seq, next_report_seq_);
2011
2012
  // Clear the "dirty" state for any tablets processed in this report.
2013
4.82M
  for (auto const & tablet : updates.tablets()) {
2014
454k
    auto it = dirty_tablets_.find(tablet.tablet_id());
2015
454k
    if (it != dirty_tablets_.end()) {
2016
454k
      const TabletReportState& state = it->second;
2017
454k
      if (state.change_seq <= acked_seq) {
2018
        // This entry has not changed since this tablet report, we no longer need to track it
2019
        // as dirty. Next modification will be re-added with a higher sequence number.
2020
355k
        dirty_tablets_.erase(it);
2021
355k
      }
2022
454k
    }
2023
454k
  }
2024
4.82M
#ifndef NDEBUG
2025
  // Verify dirty_tablets_ always processes all tablet changes.
2026
4.82M
  if (dirty_check) {
2027
4.82M
    for (auto const & d : dirty_tablets_) {
2028
269k
      if (d.second.change_seq <= acked_seq) {
2029
0
        LOG(DFATAL) << "Dirty Tablet should have been reported but wasn't: "
2030
0
                    << d.first << "@" << d.second.change_seq << " <= " << acked_seq;
2031
0
      }
2032
269k
    }
2033
4.82M
  }
2034
4.82M
#endif
2035
4.82M
}
2036
2037
// Rolls forward the deletion of a tablet that is found in a non-ready data state at startup.
//
// Accepted states are DELETED, TOMBSTONED, COPYING and INIT_STARTED; anything else is fatal.
// COPYING is treated as TOMBSTONED and INIT_STARTED as DELETED. Unless the tablet is already
// tombstoned with no RocksDB data and no WAL data on disk, its data is deleted. A DELETED tablet
// then has its superblock removed; a TOMBSTONED tablet is registered as a peer.
Status TSTabletManager::HandleNonReadyTabletOnStartup(
    const RaftGroupMetadataPtr& meta) {
  const string& tablet_id = meta->raft_group_id();
  TabletDataState data_state = meta->tablet_data_state();
  CHECK(data_state == TABLET_DATA_DELETED ||
        data_state == TABLET_DATA_TOMBSTONED ||
        data_state == TABLET_DATA_COPYING ||
        data_state == TABLET_DATA_INIT_STARTED)
      << "Unexpected TabletDataState in tablet " << tablet_id << ": "
      << TabletDataState_Name(data_state) << " (" << data_state << ")";

  switch (data_state) {
    case TABLET_DATA_COPYING:
      // We tombstone tablets that failed to remotely bootstrap.
      data_state = TABLET_DATA_TOMBSTONED;
      break;
    case TABLET_DATA_INIT_STARTED:
      // We delete tablets that failed to completely initialize after a split.
      // TODO(tsplit): https://github.com/yugabyte/yugabyte-db/issues/8013
      data_state = TABLET_DATA_DELETED;
      break;
    default:
      break;
  }

  const string kLogPrefix = TabletLogPrefix(tablet_id);

  // If the tablet is already fully tombstoned with no remaining data or WAL,
  // then no need to roll anything forward.
  const bool skip_deletion = meta->IsTombstonedWithNoRocksDBData() &&
                             !Log::HasOnDiskData(meta->fs_manager(), meta->wal_dir());

  if (!skip_deletion) {
    LOG(WARNING)
        << kLogPrefix << "Tablet Manager startup: Rolling forward tablet deletion "
        << "of type " << TabletDataState_Name(data_state);
    // Passing no OpId will retain the last_logged_opid that was previously in the metadata.
    RETURN_NOT_OK(DeleteTabletData(meta, data_state, fs_manager_->uuid(), yb::OpId()));
  }

  // We only delete the actual superblock of a TABLET_DATA_DELETED tablet on startup.
  // TODO: Consider doing this after a fixed delay, instead of waiting for a restart.
  // See KUDU-941.
  if (data_state == TABLET_DATA_DELETED) {
    LOG(INFO) << kLogPrefix << "Deleting tablet superblock";
    return meta->DeleteSuperBlock();
  }

  // Register TOMBSTONED tablets so that they get reported to the Master, which
  // allows us to permanently delete replica tombstones when a table gets deleted.
  if (data_state == TABLET_DATA_TOMBSTONED) {
    RETURN_NOT_OK(CreateAndRegisterTabletPeer(meta, NEW_PEER));
  }

  return Status::OK();
}
2091
2092
void TSTabletManager::GetAndRegisterDataAndWalDir(FsManager* fs_manager,
2093
                                                  const string& table_id,
2094
                                                  const string& tablet_id,
2095
                                                  string* data_root_dir,
2096
141k
                                                  string* wal_root_dir) {
2097
  // Skip sys catalog table and kudu table from modifying the map.
2098
141k
  if (table_id == master::kSysCatalogTableId) {
2099
0
    return;
2100
0
  }
2101
141k
  LOG(INFO) << "Get and update data/wal directory assignment map for table: " \
2102
141k
            << table_id << " and tablet " << tablet_id;
2103
141k
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
2104
  // Initialize the map if the directory mapping does not exist.
2105
141k
  auto data_root_dirs = fs_manager->GetDataRootDirs();
2106
18.4E
  CHECK(!data_root_dirs.empty()) << "No data root directories found";
2107
141k
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2108
141k
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2109
23.4k
    for (string data_root_iter : data_root_dirs) {
2110
23.4k
      unordered_set<string> tablet_id_set;
2111
23.4k
      table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set;
2112
23.4k
    }
2113
23.3k
  }
2114
  // Find the data directory with the least count of tablets for this table.
2115
141k
  table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2116
141k
  auto data_assignment_value_map = table_data_assignment_iter->second;
2117
141k
  string min_dir;
2118
141k
  uint64_t min_dir_count = kuint64max;
2119
284k
  for (auto it = data_assignment_value_map.begin(); it != data_assignment_value_map.end(); 
++it142k
) {
2120
142k
    if (min_dir_count > it->second.size()) {
2121
142k
      min_dir = it->first;
2122
142k
      min_dir_count = it->second.size();
2123
142k
    }
2124
142k
  }
2125
141k
  *data_root_dir = min_dir;
2126
  // Increment the count for min_dir.
2127
141k
  auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(min_dir);
2128
141k
  data_assignment_value_iter->second.insert(tablet_id);
2129
2130
  // Find the wal directory with the least count of tablets for this table.
2131
141k
  min_dir = "";
2132
141k
  min_dir_count = kuint64max;
2133
141k
  auto wal_root_dirs = fs_manager->GetWalRootDirs();
2134
18.4E
  CHECK(!wal_root_dirs.empty()) << "No wal root directories found";
2135
141k
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2136
141k
  if (table_wal_assignment_iter == table_wal_assignment_map_.end()) {
2137
23.4k
    for (string wal_root_iter : wal_root_dirs) {
2138
23.4k
      unordered_set<string> tablet_id_set;
2139
23.4k
      table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set;
2140
23.4k
    }
2141
23.3k
  }
2142
141k
  table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2143
141k
  auto wal_assignment_value_map = table_wal_assignment_iter->second;
2144
284k
  for (auto it = wal_assignment_value_map.begin(); it != wal_assignment_value_map.end(); 
++it142k
) {
2145
142k
    if (min_dir_count > it->second.size()) {
2146
142k
      min_dir = it->first;
2147
142k
      min_dir_count = it->second.size();
2148
142k
    }
2149
142k
  }
2150
141k
  *wal_root_dir = min_dir;
2151
141k
  auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(min_dir);
2152
141k
  wal_assignment_value_iter->second.insert(tablet_id);
2153
141k
}
2154
2155
void TSTabletManager::RegisterDataAndWalDir(FsManager* fs_manager,
2156
                                            const string& table_id,
2157
                                            const string& tablet_id,
2158
                                            const string& data_root_dir,
2159
497
                                            const string& wal_root_dir) {
2160
  // Skip sys catalog table from modifying the map.
2161
497
  if (table_id == master::kSysCatalogTableId) {
2162
0
    return;
2163
0
  }
2164
497
  LOG(INFO) << "Update data/wal directory assignment map for table: "
2165
497
            << table_id << " and tablet " << tablet_id;
2166
497
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);
2167
  // Initialize the map if the directory mapping does not exist.
2168
497
  auto data_root_dirs = fs_manager->GetDataRootDirs();
2169
497
  CHECK
(!data_root_dirs.empty()) << "No data root directories found"0
;
2170
497
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2171
497
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2172
116
    for (string data_root_iter : data_root_dirs) {
2173
116
      unordered_set<string> tablet_id_set;
2174
116
      table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set;
2175
116
    }
2176
79
  }
2177
  // Increment the count for data_root_dir.
2178
497
  table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2179
497
  auto data_assignment_value_map = table_data_assignment_iter->second;
2180
497
  auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir);
2181
497
  if (data_assignment_value_iter == table_data_assignment_map_[table_id].end()) {
2182
0
    unordered_set<string> tablet_id_set;
2183
0
    tablet_id_set.insert(tablet_id);
2184
0
    table_data_assignment_map_[table_id][data_root_dir] = tablet_id_set;
2185
497
  } else {
2186
497
    data_assignment_value_iter->second.insert(tablet_id);
2187
497
  }
2188
2189
497
  auto wal_root_dirs = fs_manager->GetWalRootDirs();
2190
497
  CHECK
(!wal_root_dirs.empty()) << "No wal root directories found"0
;
2191
497
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2192
497
  if (table_wal_assignment_iter == table_wal_assignment_map_.end()) {
2193
116
    for (string wal_root_iter : wal_root_dirs) {
2194
116
      unordered_set<string> tablet_id_set;
2195
116
      table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set;
2196
116
    }
2197
79
  }
2198
  // Increment the count for wal_root_dir.
2199
497
  table_wal_assignment_map_[table_id][wal_root_dir].insert(tablet_id);
2200
497
  table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2201
497
  auto wal_assignment_value_map = table_wal_assignment_iter->second;
2202
497
  auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir);
2203
497
  if (wal_assignment_value_iter == table_wal_assignment_map_[table_id].end()) {
2204
0
    unordered_set<string> tablet_id_set;
2205
0
    tablet_id_set.insert(tablet_id);
2206
0
    table_wal_assignment_map_[table_id][wal_root_dir] = tablet_id_set;
2207
497
  } else {
2208
497
    wal_assignment_value_iter->second.insert(tablet_id);
2209
497
  }
2210
497
}
2211
2212
// Returns a pointer to the assignment map that corresponds to 'dir_type': the WAL map for kWal
// and the data map for kData. Any other enum value is fatal.
// NOTE(review): the "Unlocked" suffix suggests the caller is expected to hold
// dir_assignment_mutex_ - confirm against the header.
TSTabletManager::TableDiskAssignmentMap* TSTabletManager::GetTableDiskAssignmentMapUnlocked(
    TabletDirType dir_type) {
  switch (dir_type) {
    case TabletDirType::kWal:
      return &table_wal_assignment_map_;
    case TabletDirType::kData:
      return &table_data_assignment_map_;
  }
  FATAL_INVALID_ENUM_VALUE(TabletDirType, dir_type);
}
2222
2223
// Looks up the root directory of kind 'dir_type' that 'tablet_id' of 'table_id' is assigned to.
//
// Returns a reference to the directory key stored in the assignment map, or an IllegalState
// status when either the table or the tablet has no entry. The lookup runs under
// dir_assignment_mutex_.
Result<const std::string&> TSTabletManager::GetAssignedRootDirForTablet(
    TabletDirType dir_type, const TableId& table_id, const TabletId& tablet_id) {
  std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_);

  auto& assignment_map = *GetTableDiskAssignmentMapUnlocked(dir_type);
  auto table_entry = assignment_map.find(table_id);
  if (table_entry == assignment_map.end()) {
    return STATUS_FORMAT(
        IllegalState, "Table ID $0 is not in $1 table assignment map", table_id, dir_type);
  }

  for (auto& dir_and_tablets : table_entry->second) {
    const auto& tablets_in_dir = dir_and_tablets.second;
    if (tablets_in_dir.find(tablet_id) != tablets_in_dir.end()) {
      return dir_and_tablets.first;
    }
  }

  return STATUS_FORMAT(
      IllegalState, "Tablet ID $0 is not found in $1 assignment map for table $2", tablet_id,
      dir_type, table_id);
}
2242
2243
void TSTabletManager::UnregisterDataWalDir(const string& table_id,
2244
                                           const string& tablet_id,
2245
                                           const string& data_root_dir,
2246
75.5k
                                           const string& wal_root_dir) {
2247
  // Skip sys catalog table from modifying the map.
2248
75.5k
  if (table_id == master::kSysCatalogTableId) {
2249
0
    return;
2250
0
  }
2251
75.5k
  LOG(INFO) << "Unregister data/wal directory assignment map for table: "
2252
75.5k
            << table_id << " and tablet " << tablet_id;
2253
75.5k
  std::lock_guard<std::mutex> lock(dir_assignment_mutex_);
2254
75.5k
  auto table_data_assignment_iter = table_data_assignment_map_.find(table_id);
2255
75.5k
  if (table_data_assignment_iter == table_data_assignment_map_.end()) {
2256
    // It is possible that we can't find an assignment for the table if the operations followed in
2257
    // this order:
2258
    // 1. The only tablet for a table gets tombstoned, and UnregisterDataWalDir removes it from
2259
    //    the maps.
2260
    // 2. TSTabletManager gets restarted (so the maps are cleared).
2261
    // 3. During TsTabletManager initialization, the tombstoned TABLET won't get registered,
2262
    //    so if a DeleteTablet request with type DELETED gets sent, UnregisterDataWalDir won't
2263
    //    find the table.
2264
2265
    // Check that both maps should be consistent.
2266
0
    DCHECK(table_wal_assignment_map_.find(table_id) == table_wal_assignment_map_.end());
2267
0
  }
2268
75.5k
  if (
table_data_assignment_iter != table_data_assignment_map_.end()75.5k
) {
2269
75.5k
    auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir);
2270
75.5k
    DCHECK(data_assignment_value_iter != table_data_assignment_map_[table_id].end())
2271
0
      << "No data directory index found for table: " << table_id;
2272
75.5k
    if (data_assignment_value_iter != table_data_assignment_map_[table_id].end()) {
2273
75.5k
      data_assignment_value_iter->second.erase(tablet_id);
2274
75.5k
    } else {
2275
0
      LOG(WARNING) << "Tablet " << tablet_id << " not in the set for data directory "
2276
0
                   << data_root_dir << "for table " << table_id;
2277
0
    }
2278
75.5k
  }
2279
75.5k
  auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id);
2280
75.5k
  if (
table_wal_assignment_iter != table_wal_assignment_map_.end()75.5k
) {
2281
75.5k
    auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir);
2282
75.5k
    DCHECK(wal_assignment_value_iter != table_wal_assignment_map_[table_id].end())
2283
0
      << "No wal directory index found for table: " << table_id;
2284
75.5k
    if (wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) {
2285
75.5k
      wal_assignment_value_iter->second.erase(tablet_id);
2286
75.5k
    } else {
2287
0
      LOG(WARNING) << "Tablet " << tablet_id << " not in the set for wal directory "
2288
0
                   << wal_root_dir << "for table " << table_id;
2289
0
    }
2290
75.5k
  }
2291
75.5k
}
2292
2293
6
// Returns a reference to the client object obtained from async_client_init_.
client::YBClient& TSTabletManager::client() {
  const auto yb_client = async_client_init_->client();
  return *yb_client;
}
2296
2297
83.7k
const std::shared_future<client::YBClient*>& TSTabletManager::client_future() {
2298
83.7k
  return async_client_init_->get_client_future();
2299
83.7k
}
2300
2301
1.96k
void TSTabletManager::MaybeDoChecksForTests(const TableId& table_id) {
2302
  // First check that the global RBS limits are respected if the flag is non-zero.
2303
1.96k
  if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than > 0) &&
2304
1.96k
      tablets_being_remote_bootstrapped_.size() >
2305
24
          FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) {
2306
0
    string tablets;
2307
    // The purpose of limiting the number of remote bootstraps is to cap how much
2308
    // network bandwidth all the RBS sessions use.
2309
    // When we finish transferring the files, we wait until the role of the new peer
2310
    // has been changed from PRE_VOTER to VOTER before we remove the tablet_id
2311
    // from tablets_being_remote_bootstrapped_. Since it's possible to be here
2312
    // because a few tablets are already open, and in the RUNNING state, but still
2313
    // in the tablets_being_remote_bootstrapped_ list, we check the state of each
2314
    // tablet before deciding if the load balancer has violated the concurrent RBS limit.
2315
0
    size_t count = 0;
2316
0
    for (const auto& tablet_id : tablets_being_remote_bootstrapped_) {
2317
0
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id);
2318
0
      if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) {
2319
0
        continue;
2320
0
      }
2321
0
      if (!tablets.empty()) {
2322
0
        tablets += ", ";
2323
0
      }
2324
0
      tablets += tablet_id;
2325
0
      count++;
2326
0
    }
2327
0
    if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) {
2328
0
      LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap sessions."
2329
0
                 << " Specified: " << FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than
2330
0
                 << ", number concurrent remote bootstrap sessions: "
2331
0
                 << tablets_being_remote_bootstrapped_.size() << ", for tablets: " << tablets;
2332
0
    }
2333
0
  }
2334
2335
  // Check that the per-table RBS limits are respected if the flag is non-zero.
2336
1.96k
  if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than > 0) &&
2337
1.96k
      tablets_being_remote_bootstrapped_per_table_[table_id].size() >
2338
24
          FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) {
2339
0
    string tablets;
2340
0
    size_t count = 0;
2341
0
    for (const auto& tablet_id : tablets_being_remote_bootstrapped_per_table_[table_id]) {
2342
0
      TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id);
2343
0
      if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) {
2344
0
        continue;
2345
0
      }
2346
0
      if (!tablets.empty()) {
2347
0
        tablets += ", ";
2348
0
      }
2349
0
      tablets += tablet_id;
2350
0
      count++;
2351
0
    }
2352
0
    if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) {
2353
0
      LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap "
2354
0
                 << "sessions per table. Specified: "
2355
0
                 << FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than
2356
0
                 << ", number of concurrent remote bootstrap sessions for table " << table_id
2357
0
                 << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size()
2358
0
                 << ", for tablets: " << tablets;
2359
0
    }
2360
0
  }
2361
1.96k
}
2362
2363
4.82M
// Applies the snapshot schedule and restoration information carried by 'info'.
//
// Under snapshot_schedule_allowed_history_cutoff_mutex_: bumps snapshot_schedules_version_,
// rebuilds the schedule -> last snapshot hybrid time map, and removes every listed schedule from
// missing_snapshot_schedules_. If the restorations update time differs from the last one seen,
// the restoration completion times are collected and, outside that lock, passed to
// CheckRestorations() of every tablet currently available in tablet_map_.
//
// Returns the first decoding or CheckRestorations() error, otherwise OK.
Status TSTabletManager::UpdateSnapshotsInfo(const master::TSSnapshotsInfoPB& info) {
  RestorationCompleteTimeMap restoration_complete_time;
  bool restorations_updated = false;
  {
    std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_);
    ++snapshot_schedules_version_;
    snapshot_schedule_allowed_history_cutoff_.clear();
    for (const auto& schedule : info.schedules()) {
      auto schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id()));
      const auto last_snapshot_ht = HybridTime::FromPB(schedule.last_snapshot_hybrid_time());
      snapshot_schedule_allowed_history_cutoff_.emplace(schedule_id, last_snapshot_ht);
      missing_snapshot_schedules_.erase(schedule_id);
    }

    HybridTime restorations_update_ht(info.last_restorations_update_ht());
    restorations_updated = restorations_update_ht != last_restorations_update_ht_;
    if (restorations_updated) {
      last_restorations_update_ht_ = restorations_update_ht;
      for (const auto& restoration : info.restorations()) {
        auto restoration_id = VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(restoration.id()));
        restoration_complete_time.emplace(
            restoration_id, HybridTime::FromPB(restoration.complete_time_ht()));
      }
    }
  }

  if (!restorations_updated) {
    return Status::OK();
  }

  // Snapshot the tablets under the shared lock, then check them after it is released.
  std::vector<tablet::TabletPtr> tablets;
  {
    SharedLock<RWMutex> shared_lock(mutex_);
    tablets.reserve(tablet_map_.size());
    for (const auto& id_and_peer : tablet_map_) {
      if (auto tablet = id_and_peer.second->shared_tablet()) {
        tablets.push_back(tablet);
      }
    }
  }
  for (const auto& tablet : tablets) {
    RETURN_NOT_OK(tablet->CheckRestorations(restoration_complete_time));
  }
  return Status::OK();
}
2406
2407
464
// Computes the history cutoff allowed for the tablet described by 'metadata'.
//
// Returns kMax when the tablet has no snapshot schedules; otherwise the minimum of the recorded
// last-snapshot times over its schedules. Returns kMin when a schedule has no snapshot time yet,
// or when a schedule is unknown and has not been missing for long enough. Schedules that have
// been unknown for at least two versions of the schedule list are removed from the metadata.
HybridTime TSTabletManager::AllowedHistoryCutoff(tablet::RaftGroupMetadata* metadata) {
  auto schedules = metadata->SnapshotSchedules();
  if (schedules.empty()) {
    return HybridTime::kMax;
  }

  std::vector<SnapshotScheduleId> schedules_to_remove;
  // Declared before the lock below, so on every exit path this cleanup runs after the lock guard
  // has been destroyed.
  auto se = ScopeExit([&schedules_to_remove, metadata]() {
    bool any_removed = false;
    for (const auto& obsolete_id : schedules_to_remove) {
      if (metadata->RemoveSnapshotSchedule(obsolete_id)) {
        any_removed = true;
      }
    }
    if (any_removed) {
      WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata");
    }
  });

  std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_);
  HybridTime result = HybridTime::kMax;
  for (const auto& schedule_id : schedules) {
    auto it = snapshot_schedule_allowed_history_cutoff_.find(schedule_id);
    if (it != snapshot_schedule_allowed_history_cutoff_.end()) {
      if (!it->second) {
        // Schedules does not have snapshots yet.
        return HybridTime::kMin;
      }
      result = std::min(result, it->second);
      continue;
    }

    // We don't know this schedule. Remember the version at which it was first seen missing.
    auto emplace_result = missing_snapshot_schedules_.emplace(
        schedule_id, snapshot_schedules_version_);
    const bool was_already_missing = !emplace_result.second;
    if (was_already_missing &&
        emplace_result.first->second + 2 <= snapshot_schedules_version_) {
      // We don't know this schedule, and there are already 2 rounds of heartbeat passed
      // after we first time found that we don't know this schedule.
      // So it means that schedule was deleted.
      // One round is not enough, because schedule could be added after heartbeat processed on
      // master, but response not yet received on TServer.
      schedules_to_remove.push_back(schedule_id);
      continue;
    }
    return HybridTime::kMin;
  }
  return result;
}
2453
2454
// Deletes the on-disk data of the tablet described by 'meta'.
//
// 'data_state' must be TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED; anything else is fatal.
// The tablet data and the WAL are removed for both states; the consensus metadata is removed
// only for TABLET_DATA_DELETED. 'uuid' is used for the log prefix. 'ts_manager' is not used by
// this function.
//
// Returns the first error encountered, otherwise OK.
Status DeleteTabletData(const RaftGroupMetadataPtr& meta,
                        TabletDataState data_state,
                        const string& uuid,
                        const yb::OpId& last_logged_opid,
                        TSTabletManager* ts_manager) {
  const string& tablet_id = meta->raft_group_id();
  const string kLogPrefix = LogPrefix(tablet_id, uuid);
  LOG(INFO) << kLogPrefix << "Deleting tablet data with delete state "
            << TabletDataState_Name(data_state);
  CHECK(data_state == TABLET_DATA_DELETED ||
        data_state == TABLET_DATA_TOMBSTONED)
      << "Unexpected data_state to delete tablet " << tablet_id << ": "
      << TabletDataState_Name(data_state) << " (" << data_state << ")";

  // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
  // that was previously in the metadata.
  RETURN_NOT_OK(meta->DeleteTabletData(data_state, last_logged_opid));
  LOG(INFO) << kLogPrefix << "Tablet deleted. Last logged OpId: "
            << meta->tombstone_last_logged_opid();
  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_blocks_deleted);

  const auto fs_manager = meta->fs_manager();
  RETURN_NOT_OK(Log::DeleteOnDiskData(
      fs_manager->env(), tablet_id, meta->wal_dir(), fs_manager->uuid()));
  MAYBE_FAULT(FLAGS_TEST_fault_crash_after_wal_deleted);

  // We do not delete the superblock or the consensus metadata when tombstoning
  // a tablet, so only TABLET_DATA_DELETED tablets go through the step below.
  if (data_state != TABLET_DATA_TOMBSTONED) {
    RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(fs_manager, tablet_id));
    MAYBE_FAULT(FLAGS_TEST_fault_crash_after_cmeta_deleted);
  }

  return Status::OK();
}
2492
2493
void LogAndTombstone(const RaftGroupMetadataPtr& meta,
2494
                     const std::string& msg,
2495
                     const std::string& uuid,
2496
                     const Status& s,
2497
3
                     TSTabletManager* ts_manager) {
2498
3
  const string& tablet_id = meta->raft_group_id();
2499
3
  const string kLogPrefix = LogPrefix(tablet_id, uuid);
2500
3
  LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString();
2501
2502
  // Tombstone the tablet when remote bootstrap fails.
2503
3
  LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed remote bootstrap";
2504
3
  Status delete_status = DeleteTabletData(meta,
2505
3
                                          TABLET_DATA_TOMBSTONED,
2506
3
                                          uuid,
2507
3
                                          yb::OpId(),
2508
3
                                          ts_manager);
2509
2510
3
  if (PREDICT_FALSE(FLAGS_TEST_sleep_after_tombstoning_tablet_secs > 0)) {
2511
    // We sleep here so that the test can verify that the state of the tablet is
2512
    // TABLET_DATA_TOMBSTONED.
2513
0
    LOG(INFO) << "Sleeping after remote bootstrap failed";
2514
0
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_sleep_after_tombstoning_tablet_secs));
2515
0
  }
2516
2517
3
  if (PREDICT_FALSE(!delete_status.ok())) {
2518
    // This failure should only either indicate a bug or an IO error.
2519
0
    LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after remote bootstrap: "
2520
0
               << delete_status.ToString();
2521
0
  }
2522
2523
  // Remove the child tracker if present.
2524
3
  if (ts_manager != nullptr) {
2525
3
    auto tracker = MemTracker::FindTracker(
2526
3
        Format("tablet-$0", meta->raft_group_id()), ts_manager->server()->mem_tracker());
2527
3
    if (tracker) {
2528
0
      tracker->UnregisterFromParent();
2529
0
    }
2530
3
  }
2531
3
}
2532
2533
// Stores the transition map, the mutex that is locked while the map is modified, and the id of
// the tablet whose entry the destructor removes.
TransitionInProgressDeleter::TransitionInProgressDeleter(
    TransitionInProgressMap* map, std::mutex* mutex, const TabletId& tablet_id)
    : in_progress_(map),
      mutex_(mutex),
      tablet_id_(tablet_id) {
}

// Erases this tablet's entry from the transition map while holding the mutex, then logs the
// removed transition after the mutex is released. A missing entry is fatal.
TransitionInProgressDeleter::~TransitionInProgressDeleter() {
  std::string transition;
  {
    std::lock_guard<std::mutex> lock(*mutex_);
    const auto iter = in_progress_->find(tablet_id_);
    CHECK(iter != in_progress_->end());
    // Keep a copy of the description; the entry itself is gone after the erase.
    transition = iter->second;
    in_progress_->erase(iter);
  }
  LOG(INFO) << "Deleted transition in progress " << transition
            << " for tablet " << tablet_id_;
}
2549
2550
// Passes 'status' through unchanged. When it is not OK, first shuts down 'tablet_peer' (if it is
// set and this call is the one that starts the shutdown) and tombstones the tablet described by
// 'meta', logging 'msg' together with the status.
Status ShutdownAndTombstoneTabletPeerNotOk(
    const Status& status, const tablet::TabletPeerPtr& tablet_peer,
    const tablet::RaftGroupMetadataPtr& meta, const std::string& uuid, const char* msg,
    TSTabletManager* ts_tablet_manager) {
  if (!status.ok()) {
    // If shutdown was initiated by someone else we should not wait for shutdown to complete.
    const bool shutdown_started_here = tablet_peer && tablet_peer->StartShutdown();
    if (shutdown_started_here) {
      tablet_peer->CompleteShutdown();
    }
    tserver::LogAndTombstone(meta, msg, uuid, status, ts_tablet_manager);
  }
  return status;
}
2564
2565
} // namespace tserver
2566
} // namespace yb