YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/integration-tests/remote_bootstrap-itest.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//

#include <memory>
#include <string>
#include <unordered_map>

#include <boost/optional.hpp>
#include <gtest/gtest.h>

#include "yb/client/client-test-util.h"
#include "yb/client/client.h"
#include "yb/client/schema.h"
#include "yb/client/table_creator.h"

#include "yb/common/wire_protocol-test-util.h"

#include "yb/fs/fs_manager.h"

#include "yb/gutil/stl_util.h"
#include "yb/gutil/strings/substitute.h"

#include "yb/integration-tests/cluster_itest_util.h"
#include "yb/integration-tests/cluster_verifier.h"
#include "yb/integration-tests/external_mini_cluster.h"
#include "yb/integration-tests/external_mini_cluster_fs_inspector.h"
#include "yb/integration-tests/test_workload.h"

#include "yb/tablet/tablet_bootstrap_if.h"
#include "yb/tablet/tablet_metadata.h"

#include "yb/tserver/remote_bootstrap_client.h"
#include "yb/tserver/remote_bootstrap_session.h"
#include "yb/tserver/tserver.pb.h"

#include "yb/util/metrics.h"
#include "yb/util/pb_util.h"
#include "yb/util/pstack_watcher.h"
#include "yb/util/status_log.h"
#include "yb/util/test_util.h"
#include "yb/util/tsan_util.h"

using namespace std::literals;
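
// Tunables for the stress tests below. These are gflags, so they can be overridden on the test
// command line.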
DEFINE_int32(test_delete_leader_num_iters, 3,
             "Number of iterations to run in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_min_rows_per_iter, 200,
             "Minimum number of rows to insert per iteration "
             "in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_payload_bytes, 16 * 1024,
             "Payload byte size in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_num_writer_threads, 1,
             "Number of writer threads in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(remote_bootstrap_itest_timeout_sec, 180,
             "Timeout in seconds to use in remote bootstrap integration test.");

using yb::client::YBClient;
using yb::client::YBClientBuilder;
using yb::client::YBSchema;
using yb::client::YBSchemaFromSchema;
using yb::client::YBTableCreator;
using yb::client::YBTableType;
using yb::client::YBTableName;
using std::shared_ptr;
using yb::consensus::CONSENSUS_CONFIG_COMMITTED;
using yb::consensus::PeerMemberType;
using yb::consensus::RaftPeerPB;
using yb::itest::TServerDetails;
using yb::tablet::TABLET_DATA_READY;
using yb::tablet::TABLET_DATA_TOMBSTONED;
using yb::tserver::ListTabletsResponsePB;
using yb::tserver::RemoteBootstrapClient;
using std::string;
using std::unordered_map;
using std::vector;
using strings::Substitute;

METRIC_DECLARE_entity(server);
METRIC_DECLARE_histogram(handler_latency_yb_consensus_ConsensusService_UpdateConsensus);
METRIC_DECLARE_counter(glog_info_messages);
METRIC_DECLARE_counter(glog_warning_messages);
METRIC_DECLARE_counter(glog_error_messages);

namespace yb {

using yb::tablet::TabletDataState;

class RemoteBootstrapITest : public YBTest {
 public:
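  // On fatal failure, dump the stacks of all live tablet servers to aid debugging. Otherwise,
  // optionally verify that all remote bootstrap checkpoints were cleaned up before shutdown.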
  void TearDown() override {
    client_.reset();
    if (HasFatalFailure()) {
      LOG(INFO) << "Found fatal failure";
      if (cluster_) {
        for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
          if (!cluster_->tablet_server(i)->IsProcessAlive()) {
            LOG(INFO) << "Tablet server " << i << " is not running. Cannot dump its stacks.";
            continue;
          }
          LOG(INFO) << "Attempting to dump stacks of TS " << i
                    << " with UUID " << cluster_->tablet_server(i)->uuid()
                    << " and pid " << cluster_->tablet_server(i)->pid();
          WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()),
                      "Couldn't dump stacks");
        }
      }
    } else if (check_checkpoints_cleared_ && cluster_) {
      CheckCheckpointsCleared();
    }
    if (cluster_) {
      cluster_->Shutdown();
    }
    YBTest::TearDown();
    ts_map_.clear();
  }

 protected:
  void StartCluster(const vector<string>& extra_tserver_flags = vector<string>(),
                    const vector<string>& extra_master_flags = vector<string>(),
                    int num_tablet_servers = 3);

  void RejectRogueLeader(YBTableType table_type);
  void DeleteTabletDuringRemoteBootstrap(YBTableType table_type);
  void RemoteBootstrapFollowerWithHigherTerm(YBTableType table_type);
  void ConcurrentRemoteBootstraps(YBTableType table_type);
  void DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType table_type);
  void DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted(YBTableType table_type);

  void CrashTestSetUp(YBTableType table_type);
  void CrashTestVerify();
  // The following tests verify that a newly added tserver gets bootstrapped even if the leader
  // crashes while bootstrapping it.
  void LeaderCrashesWhileFetchingData(YBTableType table_type);
  void LeaderCrashesBeforeChangeRole(YBTableType table_type);
  void LeaderCrashesAfterChangeRole(YBTableType table_type);

  void ClientCrashesBeforeChangeRole(YBTableType table_type);

  void StartCrashedTabletServer(
      TabletDataState expected_data_state = TabletDataState::TABLET_DATA_TOMBSTONED);
  void CheckCheckpointsCleared();

  void CreateTableAssignLeaderAndWaitForTabletServersReady(const YBTableType table_type,
                                                           const YBTableName& table_name,
                                                           const int num_tablets,
                                                           const int expected_num_tablets_per_ts,
                                                           const int leader_index,
                                                           const MonoDelta& timeout,
                                                           vector<string>* tablet_ids);

  std::unique_ptr<ExternalMiniCluster> cluster_;
  std::unique_ptr<itest::ExternalMiniClusterFsInspector> inspect_;
  std::unique_ptr<YBClient> client_;
  itest::TabletServerMap ts_map_;

  MonoDelta crash_test_timeout_;
  const MonoDelta kWaitForCrashTimeout_ = 60s;
  vector<string> crash_test_tserver_flags_;
  std::unique_ptr<TestWorkload> crash_test_workload_;
  TServerDetails* crash_test_leader_ts_ = nullptr;
  int crash_test_tserver_index_ = -1;
  int crash_test_leader_index_ = -1;
  string crash_test_tablet_id_;
  bool check_checkpoints_cleared_ = true;
};

void RemoteBootstrapITest::StartCluster(const vector<string>& extra_tserver_flags,
                                        const vector<string>& extra_master_flags,
                                        int num_tablet_servers) {
  ExternalMiniClusterOptions opts;
  opts.num_tablet_servers = num_tablet_servers;
  opts.extra_tserver_flags = extra_tserver_flags;
  opts.extra_tserver_flags.emplace_back("--remote_bootstrap_idle_timeout_ms=10000");
  opts.extra_tserver_flags.emplace_back("--never_fsync"); // fsync causes flakiness on EC2.
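  // Sanitizer builds run much more slowly, so widen the failure-detection and RPC timeouts there.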
  if (IsTsan()) {
    opts.extra_tserver_flags.emplace_back("--leader_failure_max_missed_heartbeat_periods=20");
    opts.extra_tserver_flags.emplace_back("--remote_bootstrap_begin_session_timeout_ms=13000");
    opts.extra_tserver_flags.emplace_back("--rpc_connection_timeout_ms=10000");
  } else if (IsSanitizer()) {
    opts.extra_tserver_flags.emplace_back("--leader_failure_max_missed_heartbeat_periods=15");
    opts.extra_tserver_flags.emplace_back("--remote_bootstrap_begin_session_timeout_ms=10000");
    opts.extra_tserver_flags.emplace_back("--rpc_connection_timeout_ms=7500");
  }

  opts.extra_master_flags = extra_master_flags;
  cluster_.reset(new ExternalMiniCluster(opts));
  ASSERT_OK(cluster_->Start());
  inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get()));
  ts_map_ = ASSERT_RESULT(itest::CreateTabletServerMap(cluster_.get()));
  client_ = ASSERT_RESULT(cluster_->CreateClient());
}

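// Every remote bootstrap session creates a RocksDB checkpoint under the tablet's data directory;
// once all sessions have ended, each tablet's checkpoints directory should drain to empty.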
void RemoteBootstrapITest::CheckCheckpointsCleared() {
  auto* env = Env::Default();
  auto deadline = MonoTime::Now() + 10s * kTimeMultiplier;
  for (size_t i = 0; i != cluster_->num_tablet_servers(); ++i) {
    auto tablet_server = cluster_->tablet_server(i);
    auto data_dir = tablet_server->GetDataDirs()[0];
    SCOPED_TRACE(Format("Index: $0", i));
    SCOPED_TRACE(Format("UUID: $0", tablet_server->uuid()));
    SCOPED_TRACE(Format("Data dir: $0", data_dir));
    auto meta_dir = FsManager::GetRaftGroupMetadataDir(data_dir);
    auto tablets = ASSERT_RESULT(env->GetChildren(meta_dir, ExcludeDots::kTrue));
    for (const auto& tablet : tablets) {
      SCOPED_TRACE(Format("Tablet: $0", tablet));
      auto metadata_path = JoinPathSegments(meta_dir, tablet);
      tablet::RaftGroupReplicaSuperBlockPB superblock;
      if (!pb_util::ReadPBContainerFromPath(env, metadata_path, &superblock).ok()) {
        // There is a race condition between this and a DeleteTablet request, so skip this tablet
        // if we can't read the superblock.
        continue;
      }
      auto checkpoints_dir = JoinPathSegments(
          superblock.kv_store().rocksdb_dir(), tserver::RemoteBootstrapSession::kCheckpointsDir);
      ASSERT_OK(Wait([env, checkpoints_dir]() -> bool {
        if (env->FileExists(checkpoints_dir)) {
          auto checkpoints = CHECK_RESULT(env->GetChildren(checkpoints_dir, ExcludeDots::kTrue));
          if (!checkpoints.empty()) {
            LOG(INFO) << "Checkpoints: " << yb::ToString(checkpoints);
            return false;
          }
        }
        return true;
      }, deadline, "Wait checkpoints empty"));
    }
  }
}

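// Common setup for the crash tests: a 5-tserver cluster with replication factor 4 and the load
// balancer disabled. TS 0 is shut down so that it can later be re-added via a config change,
// which forces a remote bootstrap from the tablet's leader.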
void RemoteBootstrapITest::CrashTestSetUp(YBTableType table_type) {
  crash_test_tserver_flags_.push_back("--log_segment_size_mb=1");  // Faster log rolls.
  // Start the cluster with load balancer turned off.
  vector<string> master_flags;
  master_flags.push_back("--enable_load_balancing=false");
  master_flags.push_back("--replication_factor=4");
  ASSERT_NO_FATALS(StartCluster(crash_test_tserver_flags_, master_flags, 5));
  crash_test_tserver_index_ = 0;  // We'll test with the first TS.

  LOG(INFO) << "Started cluster";
  // We'll do a config change to remote bootstrap a replica here later. For
  // now, shut it down.
  LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(crash_test_tserver_index_)->uuid();
  cluster_->tablet_server(crash_test_tserver_index_)->Shutdown();

  // Bounce the Master so it gets new tablet reports and doesn't try to assign
  // a replica to the dead TS.
  cluster_->master()->Shutdown();
  ASSERT_OK(cluster_->master()->Restart());
  ASSERT_OK(cluster_->WaitForTabletServerCount(4, crash_test_timeout_));

  // Start a workload on the cluster, and run it for a little while.
  crash_test_workload_.reset(new TestWorkload(cluster_.get()));
  crash_test_workload_->set_sequential_write(true);
  crash_test_workload_->Setup();
  ASSERT_OK(inspect_->WaitForReplicaCount(4));

  vector<string> tablets = inspect_->ListTabletsOnTS(1);
  ASSERT_EQ(1, tablets.size());
  crash_test_tablet_id_ = tablets[0];

  crash_test_workload_->Start();
  crash_test_workload_->WaitInserted(100);

  // Remote bootstrap doesn't see the active WAL segment, and we need to
  // download a file to trigger the fault in this test. Due to the log index
  // chunks, that means 3 files minimum: One in-flight WAL segment, one index
  // chunk file (these files grow much more slowly than the WAL segments), and
  // one completed WAL segment.
  crash_test_leader_ts_ = nullptr;
  ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_,
                             &crash_test_leader_ts_));
  crash_test_leader_index_ = cluster_->tablet_server_index_by_uuid(crash_test_leader_ts_->uuid());
  ASSERT_NE(-1, crash_test_leader_index_);
  ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(crash_test_leader_index_,
                                                        crash_test_tablet_id_, 3));
  crash_test_workload_->StopAndJoin();
}

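// Common verification for the crash tests: TS 0 is first tombstoned when its bootstrap session
// fails, then re-bootstrapped successfully by the newly elected leader; finally the crashed
// leader is removed from the config and restarted.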
void RemoteBootstrapITest::CrashTestVerify() {
  // Wait until the tablet has been tombstoned in TS 0. This will happen after a call to
  // rb_client->Finish() tries to end the remote bootstrap session with the crashed leader. The
  // returned error will cause the tablet to be tombstoned by the TOMBSTONE_NOT_OK macro.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, crash_test_tablet_id_,
                                                 TABLET_DATA_TOMBSTONED, crash_test_timeout_));

  // After crash_test_leader_ts_ crashes, a new leader will be elected. This new leader will detect
  // that TS 0 needs to be remote bootstrapped. Verify that this process completes successfully.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, crash_test_tablet_id_,
                                                 TABLET_DATA_READY, crash_test_timeout_ * 3));
  auto dead_leader = crash_test_leader_ts_;
  LOG(INFO) << "Dead leader: " << dead_leader->ToString();
  MonoTime start_time = MonoTime::Now();
  Status status;
  TServerDetails* new_leader;
  do {
    ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_, &new_leader));
    status = WaitUntilCommittedConfigNumVotersIs(5, new_leader, crash_test_tablet_id_,
                                                 MonoDelta::FromSeconds(1));
    if (status.ok()) {
      break;
    }
  } while (MonoTime::Now().GetDeltaSince(start_time).ToSeconds() < crash_test_timeout_.ToSeconds());

  CHECK_OK(status);

  start_time = MonoTime::Now();
  do {
    ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_,
                               &crash_test_leader_ts_));

    Status s = RemoveServer(new_leader, crash_test_tablet_id_, dead_leader, boost::none,
                            MonoDelta::FromSeconds(1), NULL, false /* retry */);
    if (s.ok()) {
      break;
    }

    // Ignore the return status if the leader is not ready or if the leader changed.
    if (s.ToString().find("Leader is not ready") == string::npos &&
        s.ToString().find("is not leader of this config") == string::npos) {
      CHECK_OK(s);
    }

    SleepFor(MonoDelta::FromMilliseconds(500));
  } while (MonoTime::Now().GetDeltaSince(start_time).ToSeconds() < crash_test_timeout_.ToSeconds());

  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(4, new_leader, crash_test_tablet_id_,
                                                crash_test_timeout_));

  ClusterVerifier cluster_verifier(cluster_.get());
  // Skip cluster_verifier.CheckCluster() because it calls ListTabletServers which gets its list
  // from TSManager::GetAllDescriptors. This list includes the tserver that is in a crash loop, and
  // the check will always fail.
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(crash_test_workload_->table_name(),
                                                  ClusterVerifier::AT_LEAST,
                                                  crash_test_workload_->rows_inserted()));

  StartCrashedTabletServer();
}

void RemoteBootstrapITest::StartCrashedTabletServer(TabletDataState expected_data_state) {
  // Restore the crashed leader so it can clean up its checkpoint.
  LOG(INFO) << "Starting crashed " << crash_test_leader_index_;
  // The process is already stopped; we call Shutdown() first to synchronize the in-memory state.
  cluster_->tablet_server(crash_test_leader_index_)->Shutdown();
  ASSERT_OK(cluster_->tablet_server(crash_test_leader_index_)->Start());
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
      crash_test_leader_index_, crash_test_tablet_id_, expected_data_state));
}

// If a rogue (a.k.a. zombie) leader tries to remote bootstrap a tombstoned
// tablet, make sure its term isn't older than the latest term we observed.
// If it is older, make sure we reject the request, to avoid allowing old
// leaders to create a parallel universe. This is possible because config
// change could cause nodes to move around. The term check is reasonable
// because only one node can be elected leader for a given term.
//
// A leader can "go rogue" due to a VM pause, CTRL-z, partition, etc.
void RemoteBootstrapITest::RejectRogueLeader(YBTableType table_type) {
  // This test pauses for at least 10 seconds. Only run in slow-test mode.
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
  };
  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--use_create_table_leader_hint=false"s,
  };
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const MonoDelta timeout = MonoDelta::FromSeconds(30);
  const int kTsIndex = 0; // We'll test with the first TS.
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  // Elect a leader for term 1, then run some data through the cluster.
  int zombie_leader_index = 1;
  string zombie_leader_uuid = cluster_->tablet_server(zombie_leader_index)->uuid();
  ASSERT_OK(itest::StartElection(ts_map_[zombie_leader_uuid].get(), tablet_id, timeout));
  workload.Start();
  workload.WaitInserted(100);
  workload.StopAndJoin();

  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Come out of the blue and try to remotely bootstrap a running server while
  // specifying an old term. That running server should reject the request.
  // We are essentially masquerading as a rogue leader here.
  Status s = itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid,
                                         HostPort(cluster_->tablet_server(1)->bound_rpc_addr()),
                                         0, // Say I'm from term 0.
                                         timeout);
  ASSERT_TRUE(s.IsInvalidArgument());
  ASSERT_STR_CONTAINS(s.ToString(), "term 0 lower than last logged term 1");

  // Now pause the actual leader so we can bring him back as a zombie later.
  ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Pause());

  // Trigger TS 2 to become leader of term 2.
  int new_leader_index = 2;
  string new_leader_uuid = cluster_->tablet_server(new_leader_index)->uuid();
  ASSERT_OK(itest::StartElection(ts_map_[new_leader_uuid].get(), tablet_id, timeout));
  ASSERT_OK(itest::WaitUntilLeader(ts_map_[new_leader_uuid].get(), tablet_id, timeout));

  auto active_ts_map = CreateTabletServerMapUnowned(ts_map_);
  ASSERT_EQ(1, active_ts_map.erase(zombie_leader_uuid));

  // Wait for the NO_OP entry from the term 2 election to propagate to the
  // remaining nodes' logs so that we are guaranteed to reject the rogue
  // leader's remote bootstrap request when we bring it back online.
  auto log_index = workload.batches_completed() + 2; // 2 terms == 2 additional NO_OP entries.
  ASSERT_OK(WaitForServersToAgree(timeout, active_ts_map, tablet_id, log_index));
  // TODO: Write more rows to the new leader once KUDU-1034 is fixed.

  // Now kill the new leader and tombstone the replica on TS 0.
  cluster_->tablet_server(new_leader_index)->Shutdown();
  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout));

  // Zombies!!! Resume the rogue zombie leader.
  // He should attempt to remote bootstrap TS 0 but fail.
  ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume());

  // Loop for a few seconds to ensure that the tablet doesn't transition to READY.
  MonoTime deadline = MonoTime::Now();
  deadline.AddDelta(MonoDelta::FromSeconds(5));
  while (MonoTime::Now().ComesBefore(deadline)) {
    ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
    ASSERT_EQ(1, tablets.size());
    ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state());
    SleepFor(MonoDelta::FromMilliseconds(10));
  }

  // Force the rogue leader to step down.
  // Then, send a remote bootstrap start request from a "fake" leader that
  // sends an up-to-date term in the RB request but the actual term stored
  // in the bootstrap source's consensus metadata would still be old.
  LOG(INFO) << "Forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid
            << " to step down...";
  ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid].get(), tablet_id, nullptr, timeout));
  ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index);
  // It's not necessarily part of the API, but this could return failure due to
  // rejecting the remote. We intend to make that part async though, so we ignore
  // this return value in this test.
  WARN_NOT_OK(itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid,
                                          HostPort(zombie_ets->bound_rpc_addr()),
                                          2, // Say I'm from term 2.
                                          timeout),
              "Start remote bootstrap failed");

  // Wait another few seconds to be sure the remote bootstrap is rejected.
  deadline = MonoTime::Now();
  deadline.AddDelta(MonoDelta::FromSeconds(5));
  while (MonoTime::Now().ComesBefore(deadline)) {
    ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
    ASSERT_EQ(1, tablets.size());
    ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state());
    SleepFor(MonoDelta::FromMilliseconds(10));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
      workload.rows_inserted()));
}

// Start remote bootstrap session and delete the tablet in the middle.
// It should actually be possible to complete bootstrap in such a case, because
// when a remote bootstrap session is started on the "source" server, all of
// the relevant files are either read or opened, meaning that an in-progress
// remote bootstrap can complete even after a tablet is officially "deleted" on
// the source server. This is also a regression test for KUDU-1009.

// For YugabyteDB we have modified this test. We no longer expect the remote bootstrap
// to finish successfully after the tablet has been deleted on the leader, but we do
// expect the leader not to crash.
void RemoteBootstrapITest::DeleteTabletDuringRemoteBootstrap(YBTableType table_type) {
  MonoDelta timeout = MonoDelta::FromSeconds(10);
  const int kTsIndex = 0; // We'll test with the first TS.
  ASSERT_NO_FATALS(StartCluster());

  // Populate a tablet with some data.
  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);
  workload.Start();
  workload.WaitInserted(1000);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Ensure all the servers agree before we proceed.
  workload.StopAndJoin();
  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Set up an FsManager to use with the RemoteBootstrapClient.
  FsManagerOpts opts;
  string testbase = GetTestPath("fake-ts");
  ASSERT_OK(env_->CreateDir(testbase));
  opts.wal_paths.push_back(JoinPathSegments(testbase, "wals"));
  opts.data_paths.push_back(JoinPathSegments(testbase, "data-0"));
  opts.server_type = "tserver_test";
  std::unique_ptr<FsManager> fs_manager(new FsManager(env_.get(), opts));
  ASSERT_OK(fs_manager->CreateInitialFileSystemLayout());
  ASSERT_OK(fs_manager->Open());

  // Start up a RemoteBootstrapClient and open a remote bootstrap session.
  RemoteBootstrapClient rb_client(tablet_id, fs_manager.get());
  scoped_refptr<tablet::RaftGroupMetadata> meta;
  ASSERT_OK(rb_client.Start(cluster_->tablet_server(kTsIndex)->uuid(),
                            &cluster_->proxy_cache(),
                            cluster_->tablet_server(kTsIndex)->bound_rpc_hostport(),
                            &meta));

  // Tombstone the tablet on the remote!
  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout));

  // Now finish bootstrapping!
  tablet::TabletStatusListener listener(meta);

  // This will fail because the leader won't have any rocksdb files since they were deleted when
  // we called DeleteTablet.
  ASSERT_NOK(rb_client.FetchAll(&listener));
  ASSERT_OK(rb_client.EndRemoteSession());
  ASSERT_OK(rb_client.Remove());

  SleepFor(MonoDelta::FromMilliseconds(500));  // Give a little time for a crash (KUDU-1009).
  ASSERT_TRUE(cluster_->tablet_server(kTsIndex)->IsProcessAlive());

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
      workload.rows_inserted()));
}

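// Verify that a follower whose remote bootstrap downloads only part of a WAL segment (simulated
// with --TEST_download_partial_wal_segments) ends up tombstoned rather than crashing, and that a
// later bootstrap attempt succeeds once the fault flag is turned off.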
TEST_F(RemoteBootstrapITest, IncompleteWALDownloadDoesntCauseCrash) {
  std::vector<std::string> master_flags = {
      "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
      "--enable_load_balancing=false"s,
      "--use_create_table_leader_hint=false"s,
  };

  int constexpr kBootstrapIdleTimeoutMs = 5000;

  std::vector<std::string> ts_flags = {
      "--log_min_segments_to_retain=1000"s,
      "--log_min_seconds_to_retain=900"s,
      "--log_segment_size_bytes=65536"s,
      "--enable_leader_failure_detection=false"s,
      // Disable pre-elections since we wait for term to become 2,
      // that does not happen with pre-elections.
      "--use_preelection=false"s,
      "--memstore_size_mb=1"s,
      "--TEST_download_partial_wal_segments=true"s,
      "--remote_bootstrap_idle_timeout_ms="s + std::to_string(kBootstrapIdleTimeoutMs)
  };

  const int kNumTabletServers = 3;
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers));

  // Elect a leader for term 1, then run some data through the cluster.
  const int kLeaderIndex = 1;

  constexpr int kNumTablets = RegularBuildVsSanitizers(10, 2);
  MonoDelta timeout = MonoDelta::FromSeconds(RegularBuildVsSanitizers(15, 45));

  vector<string> tablet_ids;

  CreateTableAssignLeaderAndWaitForTabletServersReady(YBTableType::YQL_TABLE_TYPE,
      TestWorkloadOptions::kDefaultTableName, kNumTablets, kNumTablets, kLeaderIndex, timeout,
      &tablet_ids);

  const int kFollowerIndex = 0;
  TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_write_batch_size(10);
  workload.set_num_write_threads(10);
  workload.set_sequential_write(true);
  workload.Setup(YBTableType::YQL_TABLE_TYPE);
  workload.Start();
  workload.WaitInserted(RegularBuildVsSanitizers(500, 5000));
  workload.StopAndJoin();

  // Ensure all the servers agree before we proceed.
  for (const auto& tablet_id : tablet_ids) {
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }
  LOG(INFO) << "All servers agree";
  // Now tombstone the follower.
  ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_ids[0], TABLET_DATA_TOMBSTONED, boost::none,
                                timeout));
  LOG(INFO) << "Successfully sent delete request " << tablet_ids[0] << " on TS " << kFollowerIndex;
  // Wait until the tablet has been tombstoned on the follower.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_ids[0],
                                                 tablet::TABLET_DATA_TOMBSTONED, timeout));
  LOG(INFO) << "Tablet state on TS " << kFollowerIndex << " is TABLET_DATA_TOMBSTONED";
  SleepFor(MonoDelta::FromMilliseconds(5000));  // Give a little time for a crash.
  ASSERT_TRUE(cluster_->tablet_server(kFollowerIndex)->IsProcessAlive());
  LOG(INFO) << "TS " << kFollowerIndex << " didn't crash";
  // Now remove the crash flag, so the next replay will complete.
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(kFollowerIndex),
                              "TEST_download_partial_wal_segments", "false"));
  LOG(INFO) << "Successfully turned off flag TEST_download_partial_wal_segments";
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_ids[0],
                                                 tablet::TABLET_DATA_READY, timeout * 4));
  LOG(INFO) << "Tablet " << tablet_ids[0]
            << " is in state TABLET_DATA_READY in TS " << kFollowerIndex;
  TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get();
  OpId op_id = ASSERT_RESULT(GetLastOpIdForReplica(
      tablet_ids[0], leader_ts, consensus::COMMITTED_OPID, timeout));

  auto expected_index = op_id.index;

  ASSERT_OK(WaitForServersToAgree(timeout,
                                  ts_map_,
                                  tablet_ids[0],
                                  expected_index,
                                  &expected_index));

  LOG(INFO) << "Op id index in TS " << kFollowerIndex << " is " << op_id.index
            << " for tablet " << tablet_ids[0];

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
                                                  workload.rows_inserted()));

  LOG(INFO) << "Cluster verifier succeeded";

  // Sleep to make sure all the remote bootstrap sessions that were initiated but
  // not completed are expired.
  SleepFor(MonoDelta::FromMilliseconds(kBootstrapIdleTimeoutMs * 2));
}

// This test ensures that a leader can remote-bootstrap a tombstoned replica
// that has a higher term recorded in the replica's consensus metadata if the
// replica's last-logged opid has the same term (or less) as the leader serving
// as the remote bootstrap source. When a tablet is tombstoned, its last-logged
// opid is stored in a field of its on-disk superblock.
void RemoteBootstrapITest::RemoteBootstrapFollowerWithHigherTerm(YBTableType table_type) {
  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
    // Disable pre-elections since we wait for term to become 2,
    // that does not happen with pre-elections.
    "--use_preelection=false"s
  };

  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--replication_factor=2"s,
    "--use_create_table_leader_hint=false"s,
  };

  const int kNumTabletServers = 2;
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers));

  const MonoDelta timeout = MonoDelta::FromSeconds(30);
  const int kFollowerIndex = 0;
  TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(follower_ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  // Elect a leader for term 1, then run some data through the cluster.
  const int kLeaderIndex = 1;
  TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get();
  ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout));
  workload.Start();
  workload.WaitInserted(100);
  workload.StopAndJoin();

  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Pause the leader and increment the term on the follower by starting an
  // election on the follower. The election will fail asynchronously but we
  // just wait until we see that its term has incremented.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause());
  ASSERT_OK(itest::StartElection(
      follower_ts, tablet_id, timeout, consensus::TEST_SuppressVoteRequest::kTrue));
  int64_t term = 0;
  for (int i = 0; i < 1000; i++) {
    consensus::ConsensusStatePB cstate;
    ASSERT_OK(itest::GetConsensusState(follower_ts, tablet_id, CONSENSUS_CONFIG_COMMITTED,
                                       timeout, &cstate));
    term = cstate.current_term();
    if (term == 2) break;
    SleepFor(MonoDelta::FromMilliseconds(10));
  }
  ASSERT_EQ(2, term);

  // Now tombstone the follower.
  ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                timeout));

  // Wait until the tablet has been tombstoned on the follower.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
                                                 tablet::TABLET_DATA_TOMBSTONED, timeout));

  // Now wake the leader. It should detect that the follower needs to be
  // remotely bootstrapped and proceed to bring it back up to date.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume());

  // Wait for remote bootstrap to complete successfully.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
                                                 tablet::TABLET_DATA_READY, timeout));

  // Wait for the follower to come back up.
  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  ClusterVerifier cluster_verifier(cluster_.get());
  // During this test we disable leader failure detection, so the cluster could end up without a
  // leader at all. We therefore verify with CONSISTENT_PREFIX, and skip CheckCluster for the same
  // reason.
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(
      workload.table_name(), ClusterVerifier::EXACTLY, workload.rows_inserted(),
      YBConsistencyLevel::CONSISTENT_PREFIX));
}

void RemoteBootstrapITest::CreateTableAssignLeaderAndWaitForTabletServersReady(
    const YBTableType table_type, const YBTableName& table_name, const int num_tablets,
    const int expected_num_tablets_per_ts, const int leader_index, const MonoDelta& timeout,
    vector<string>* tablet_ids) {

  ASSERT_OK(client_->CreateNamespaceIfNotExists(table_name.namespace_name()));

  // Create a table with several tablets. These will all be simultaneously
  // remotely bootstrapped to a single target node from the same leader host.
  YBSchema client_schema(YBSchemaFromSchema(GetSimpleTestSchema()));
  std::unique_ptr<YBTableCreator> table_creator(client_->NewTableCreator());
  ASSERT_OK(table_creator->table_name(table_name)
                .num_tablets(num_tablets)
                .schema(&client_schema)
                .table_type(table_type)
                .Create());

  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();

  // Figure out the tablet ids of the created tablets.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, expected_num_tablets_per_ts, timeout, &tablets));

  for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) {
    tablet_ids->push_back(t.tablet_status().tablet_id());
  }

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    for (const string& tablet_id : *tablet_ids) {
      ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                              tablet_id, timeout));
    }
  }

  // Elect leaders on each tablet for term 1. All leaders will be on TS leader_index.
  const string kLeaderUuid = cluster_->tablet_server(leader_index)->uuid();
  for (const string& tablet_id : *tablet_ids) {
    ASSERT_OK(itest::StartElection(ts_map_[kLeaderUuid].get(), tablet_id, timeout));
  }

  for (const string& tablet_id : *tablet_ids) {
    TServerDetails* leader_ts = nullptr;
    ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
    ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, timeout));
  }
}

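// For reference, a typical invocation of the helper above (this mirrors the call sites in the
// tests below; kNumTablets, kLeaderIndex and timeout are whatever the individual test uses):
//
//   vector<string> tablet_ids;
//   CreateTableAssignLeaderAndWaitForTabletServersReady(
//       YBTableType::YQL_TABLE_TYPE, TestWorkloadOptions::kDefaultTableName,
//       kNumTablets, /* expected_num_tablets_per_ts */ kNumTablets,
//       kLeaderIndex, timeout, &tablet_ids);
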
// Test that multiple concurrent remote bootstraps do not cause problems.
// This is a regression test for KUDU-951, in which concurrent sessions on
// multiple tablets between the same remote bootstrap client host and remote
// bootstrap source host could corrupt each other.
void RemoteBootstrapITest::ConcurrentRemoteBootstraps(YBTableType table_type) {
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
    "--log_cache_size_limit_mb=1"s,
    "--log_segment_size_mb=1"s,
    "--log_async_preallocate_segments=false"s,
    "--log_min_segments_to_retain=100"s,
    "--maintenance_manager_polling_interval_ms=10"s,
  };
  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--enable_load_balancing=false"s,
    "--use_create_table_leader_hint=false"s,
  };
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const int kNumTablets = 10;
  const int kLeaderIndex = 1;
  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  vector<string> tablet_ids;

  CreateTableAssignLeaderAndWaitForTabletServersReady(table_type,
      TestWorkloadOptions::kDefaultTableName, kNumTablets, kNumTablets, kLeaderIndex, timeout,
      &tablet_ids);

  TestWorkload workload(cluster_.get());
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_write_batch_size(10);
  workload.set_num_write_threads(10);
  workload.Setup(table_type);
  workload.Start();
  workload.WaitInserted(20000);
  workload.StopAndJoin();

  for (const string& tablet_id : tablet_ids) {
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }

  // Now pause the leader so we can tombstone the tablets.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause());

  const int kTsIndex = 0; // We'll test with the first TS.
  TServerDetails* target_ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();

  for (const string& tablet_id : tablet_ids) {
    LOG(INFO) << "Tombstoning tablet " << tablet_id << " on TS " << target_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(target_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  MonoDelta::FromSeconds(10)));
  }

  // Unpause the leader TS and wait for it to remotely bootstrap the tombstoned
  // tablets, in parallel.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume());
  for (const string& tablet_id : tablet_ids) {
    ASSERT_OK(itest::WaitUntilTabletRunning(target_ts, tablet_id, timeout));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
                            workload.rows_inserted()));
}

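// Verify that the load balancer never schedules more concurrent remote bootstrap sessions than
// --load_balancer_max_concurrent_tablet_remote_bootstraps (and its per-table variant): the
// TEST_crash_if_remote_bootstrap_sessions_* tserver flags make the tablet server crash, and thus
// the test fail, if the observed session count ever exceeds the expected bounds.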
TEST_F(RemoteBootstrapITest, TestLimitNumberOfConcurrentRemoteBootstraps) {
  constexpr int kMaxConcurrentTabletRemoteBootstrapSessions = 5;
  constexpr int kMaxConcurrentTabletRemoteBootstrapSessionsPerTable = 2;

  vector<string> ts_flags;
  int follower_considered_failed_sec = 10;
  ts_flags.push_back("--follower_unavailable_considered_failed_sec=" +
                     std::to_string(follower_considered_failed_sec));
  ts_flags.push_back("--heartbeat_interval_ms=100");
  ts_flags.push_back("--enable_leader_failure_detection=false");
  ts_flags.push_back("--TEST_crash_if_remote_bootstrap_sessions_greater_than=" +
      std::to_string(kMaxConcurrentTabletRemoteBootstrapSessions + 1));
  ts_flags.push_back("--TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than=" +
      std::to_string(kMaxConcurrentTabletRemoteBootstrapSessionsPerTable + 1));
  ts_flags.push_back("--TEST_simulate_long_remote_bootstrap_sec=5");

  std::vector<std::string> master_flags = {
    "--TEST_load_balancer_handle_under_replicated_tablets_only=true"s,
    "--load_balancer_max_concurrent_tablet_remote_bootstraps=" +
        std::to_string(kMaxConcurrentTabletRemoteBootstrapSessions),
    "--load_balancer_max_concurrent_tablet_remote_bootstraps_per_table=" +
        std::to_string(kMaxConcurrentTabletRemoteBootstrapSessionsPerTable),
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    // This value has to be less than follower_considered_failed_sec.
    "--tserver_unresponsive_timeout_ms=8000"s,
    "--use_create_table_leader_hint=false"s,
  };

  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  const int kLeaderIndex = 1;
  const int kNumberTables = 3;
  const int kNumTablets = 8;
  vector<vector<string>> tablet_ids;

  for (int i = 0; i < kNumberTables; i++) {
    tablet_ids.push_back(vector<string>());
    std::string table_name_str = "table_test_" + std::to_string(i);
    std::string keyspace_name_str = "keyspace_test_" + std::to_string(i);
    YBTableName table_name(YQL_DATABASE_CQL, keyspace_name_str, table_name_str);
    CreateTableAssignLeaderAndWaitForTabletServersReady(YBTableType::YQL_TABLE_TYPE,
        table_name, kNumTablets, (i + 1) * kNumTablets, kLeaderIndex, timeout, &(tablet_ids[i]));
  }

  const int kTsIndex = 0; // We'll test with the first TS.

  // Now pause the first tserver so that it gets removed from the configuration for all of the
  // tablets.
  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Pause());
  LOG(INFO) << "Paused tserver " << cluster_->tablet_server(kTsIndex)->uuid();

  // Sleep for longer than FLAGS_follower_unavailable_considered_failed_sec to guarantee that the
  // other peers in the config for each tablet remove this tserver from the raft config.
  int total_time_slept = 0;
  while (total_time_slept < follower_considered_failed_sec * 2) {
    SleepFor(MonoDelta::FromSeconds(1));
    total_time_slept++;
    LOG(INFO) << "Total time slept " << total_time_slept;
  }


  // Resume the tserver. The cluster balancer will ensure that all the tablets are added back to
  // this tserver, and it will cause the leader to start remote bootstrap sessions for all of the
  // tablets. FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than will make sure that we
  // never have more than the expected number of concurrent remote bootstrap sessions.
  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Resume());

  LOG(INFO) << "Tserver " << cluster_->tablet_server(kTsIndex)->uuid() << " resumed";

  // Wait until the configs for all the tablets have three voters. This means that the tserver
  // we just resumed was remote bootstrapped correctly.
  for (int i = 0; i < kNumberTables; i++) {
    for (const string& tablet_id : tablet_ids[i]) {
      TServerDetails* leader_ts = nullptr;
      ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
      LOG(INFO) << "Waiting until config has 3 voters for tablet " << tablet_id;
      ASSERT_OK(itest::WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, timeout));
    }
  }
}

// Test that repeatedly runs a load, tombstones a follower, then tombstones the
// leader while the follower is remotely bootstrapping. Regression test for
// KUDU-1047.
void RemoteBootstrapITest::DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType table_type) {
  // This test takes a while due to failure detection.
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  vector<string> master_flags;
  master_flags.push_back("--replication_factor=5");
  ASSERT_NO_FATALS(StartCluster(vector<string>(), master_flags, 5));

  TestWorkload workload(cluster_.get());
  workload.set_payload_bytes(FLAGS_test_delete_leader_payload_bytes);
  workload.set_num_write_threads(FLAGS_test_delete_leader_num_writer_threads);
  workload.set_write_batch_size(1);
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_not_found_allowed(true);
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id.
  const int kTsIndex = 0;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  int leader_index = -1;
  int follower_index = -1;
  TServerDetails* leader_ts = nullptr;
  TServerDetails* follower_ts = nullptr;

  for (int i = 0; i < FLAGS_test_delete_leader_num_iters; i++) {
    LOG(INFO) << "Iteration " << (i + 1);
    auto rows_previously_inserted = workload.rows_inserted();

    // Find out who's leader.
    ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
    leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid());

    // Select an arbitrary follower.
    follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
    follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()].get();

    // Spin up the workload.
    workload.Start();
    while (workload.rows_inserted() < rows_previously_inserted +
                                      FLAGS_test_delete_leader_min_rows_per_iter) {
      SleepFor(MonoDelta::FromMilliseconds(10));
    }

    // Tombstone the follower.
    LOG(INFO) << "Tombstoning follower tablet " << tablet_id << " on TS " << follower_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  timeout));

    // Wait for remote bootstrap to start.
    // ENG-81: There is a frequent race condition here: if the bootstrap happens too quickly, we can
    // see TABLET_DATA_READY right away without seeing TABLET_DATA_COPYING first (at least that's a
    // working hypothesis of an explanation). In an attempt to remedy this, we have increased the
    // number of rows inserted per iteration from 20 to 200.
    ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(follower_index, tablet_id,
                                                   tablet::TABLET_DATA_COPYING, timeout));

    // Tombstone the leader.
    LOG(INFO) << "Tombstoning leader tablet " << tablet_id << " on TS " << leader_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(leader_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  timeout));

    // Quiesce and rebuild to full strength. This involves electing a new
    // leader from the remaining three, which requires a unanimous vote, and
    // that leader then remotely bootstrapping the old leader.
    workload.StopAndJoin();
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
                            workload.rows_inserted()));
}

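// Helpers that sample server-wide metrics from an external tablet server: the count of
// UpdateConsensus RPCs handled, and the total number of glog INFO/WARNING/ERROR messages.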
namespace {
int64_t CountUpdateConsensusCalls(ExternalTabletServer* ets, const string& tablet_id) {
  return CHECK_RESULT(ets->GetInt64Metric(
      &METRIC_ENTITY_server,
      "yb.tabletserver",
      &METRIC_handler_latency_yb_consensus_ConsensusService_UpdateConsensus,
      "total_count"));
}
int64_t CountLogMessages(ExternalTabletServer* ets) {
  int64_t total = 0;

  total += CHECK_RESULT(ets->GetInt64Metric(
      &METRIC_ENTITY_server,
      "yb.tabletserver",
      &METRIC_glog_info_messages,
      "value"));

  total += CHECK_RESULT(ets->GetInt64Metric(
      &METRIC_ENTITY_server,
      "yb.tabletserver",
      &METRIC_glog_warning_messages,
      "value"));

  total += CHECK_RESULT(ets->GetInt64Metric(
      &METRIC_ENTITY_server,
      "yb.tabletserver",
      &METRIC_glog_error_messages,
      "value"));

  return total;
}
} // anonymous namespace

// Test that if remote bootstrap is disabled by a flag, we don't get into
// tight loops after a tablet is deleted. This is a regression test for a situation
// similar to the bug described in KUDU-821: we were previously handling a missing
// tablet within consensus in such a way that we'd immediately send another RPC.
void RemoteBootstrapITest::DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted(
1116
1
    YBTableType table_type) {
1117
1118
1
  MonoDelta timeout = MonoDelta::FromSeconds(10);
1119
1
  std::vector<string> ts_flags = {
1120
1
    "--enable_leader_failure_detection=false"s,
1121
1
    "--TEST_enable_remote_bootstrap=false"s,
1122
1
    "--rpc_slow_query_threshold_ms=10000000"s,
1123
1
  };
1124
1
  std::vector<std::string> master_flags = {
1125
1
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
1126
1
    "--use_create_table_leader_hint=false"s,
1127
1
  };
1128
1
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));
1129
1130
1
  TestWorkload workload(cluster_.get());
1131
  // TODO(KUDU-1054): the client should handle retrying on different replicas
1132
  // if the tablet isn't found, rather than giving us this error.
1133
1
  workload.set_not_found_allowed(true);
1134
1
  workload.set_write_batch_size(1);
1135
1
  workload.Setup(table_type);
1136
1137
  // Figure out the tablet id of the created tablet.
1138
1
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
1139
1
  ExternalTabletServer* replica_ets = cluster_->tablet_server(1);
1140
1
  TServerDetails* replica_ts = ts_map_[replica_ets->uuid()].get();
1141
1
  ASSERT_OK(WaitForNumTabletsOnTS(replica_ts, 1, timeout, &tablets));
1142
0
  string tablet_id = tablets[0].tablet_status().tablet_id();
1143
1144
  // Wait until all replicas are up and running.
1145
0
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
1146
0
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
1147
0
                                            tablet_id, timeout));
1148
0
  }
1149
1150
  // Elect a leader (TS 0)
1151
0
  ExternalTabletServer* leader_ts = cluster_->tablet_server(0);
1152
0
  ASSERT_OK(itest::StartElection(ts_map_[leader_ts->uuid()].get(), tablet_id, timeout));
1153
1154
  // Start writing, wait for some rows to be inserted.
1155
0
  workload.Start();
1156
0
  workload.WaitInserted(100);
1157
1158
  // Tombstone the tablet on one of the servers (TS 1)
1159
0
  ASSERT_OK(itest::DeleteTablet(replica_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
1160
0
                                timeout));
1161
1162
  // Ensure that, if we sleep for a second while still doing writes to the leader:
1163
  // a) we don't spew logs on the leader side
1164
  // b) we don't get hit with a lot of UpdateConsensus calls on the replica.
1165
0
  int64_t num_update_rpcs_initial = CountUpdateConsensusCalls(replica_ets, tablet_id);
1166
0
  int64_t num_logs_initial = CountLogMessages(leader_ts);
1167
1168
0
  SleepFor(MonoDelta::FromSeconds(1));
1169
0
  int64_t num_update_rpcs_after_sleep = CountUpdateConsensusCalls(replica_ets, tablet_id);
1170
0
  int64_t num_logs_after_sleep = CountLogMessages(leader_ts);
1171
1172
  // Calculate rate per second of RPCs and log messages
1173
0
  int64_t update_rpcs_per_second = num_update_rpcs_after_sleep - num_update_rpcs_initial;
1174
0
  EXPECT_LT(update_rpcs_per_second, 20);
1175
0
  int64_t num_logs_per_second = num_logs_after_sleep - num_logs_initial;
1176
0
  EXPECT_LT(num_logs_per_second, 20);
1177
0
}
1178
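// The crash-test scenarios below use TEST_fault_crash_* flags. Each flag is the probability of
// crashing at a specific code point, so setting one to "1.0" via ExternalMiniCluster::SetFlag
// makes the crash deterministic once that code path is reached.
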
void RemoteBootstrapITest::LeaderCrashesWhileFetchingData(YBTableType table_type) {
  crash_test_timeout_ = MonoDelta::FromSeconds(40);
  CrashTestSetUp(table_type);

  // Cause the leader to crash when a follower tries to fetch data from it.
  const string& fault_flag = "TEST_fault_crash_on_handle_rb_fetch_data";
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag,
                              "1.0"));

  // Add our TS 0 to the config and wait for the leader to crash.
  ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart());
  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();

  ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts,
                             PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_,
                             NULL /* error code */,
                             true /* retry */));

  ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_));

  CrashTestVerify();
}

void RemoteBootstrapITest::LeaderCrashesBeforeChangeRole(YBTableType table_type) {
  // Make the tablet server sleep in LogAndTombstone after it has called DeleteTabletData so we can
  // verify that the tablet has been tombstoned (by calling WaitForTabletDataStateOnTS).
  crash_test_tserver_flags_.push_back("--TEST_sleep_after_tombstoning_tablet_secs=5");
  crash_test_timeout_ = MonoDelta::FromSeconds(40);
  CrashTestSetUp(table_type);

  // Cause the leader to crash when the follower ends the remote bootstrap session, just before
  // the leader is about to change the role of the follower.
  const string& fault_flag = "TEST_fault_crash_leader_before_changing_role";
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag,
                              "1.0"));

  // Add our TS 0 to the config and wait for the leader to crash.
  ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart());
  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();
  ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts,
                             PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_));
  ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_));
  CrashTestVerify();
}

void RemoteBootstrapITest::LeaderCrashesAfterChangeRole(YBTableType table_type) {
  // Make the tablet server sleep in LogAndTombstone after it has called DeleteTabletData so we can
  // verify that the tablet has been tombstoned (by calling WaitForTabletDataStateOnTS).
  crash_test_tserver_flags_.push_back("--TEST_sleep_after_tombstoning_tablet_secs=5");
  crash_test_timeout_ = MonoDelta::FromSeconds(40);
  CrashTestSetUp(table_type);

  // Cause the leader to crash after it has successfully sent a ChangeConfig CHANGE_ROLE request
  // and before it responds to the EndRemoteBootstrapSession request.
  const string& fault_flag = "TEST_fault_crash_leader_after_changing_role";
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag,
                              "1.0"));

  // Add our TS 0 to the config and wait for the leader to crash.
  ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart());
  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();
  ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts,
                             PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_));
  ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_));

  CrashTestVerify();
}

void RemoteBootstrapITest::ClientCrashesBeforeChangeRole(YBTableType table_type) {
  crash_test_timeout_ = MonoDelta::FromSeconds(40);
  crash_test_tserver_flags_.push_back("--TEST_return_error_on_change_config=0.60");
  CrashTestSetUp(table_type);

  // Add our TS 0 to the config and wait for it to crash.
  ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart());
  // Cause the newly added tserver to crash after the transfer of files for remote bootstrap has
  // completed, but before ending the session with the leader, to avoid triggering a ChangeConfig
  // in the leader.
  const string& fault_flag = "TEST_fault_crash_bootstrap_client_before_changing_role";
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_tserver_index_), fault_flag,
                              "1.0"));

  TServerDetails* ts = ts_map_[cluster_->tablet_server(crash_test_tserver_index_)->uuid()].get();
  ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts,
                             PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_));

  ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_tserver_index_, kWaitForCrashTimeout_));

  LOG(INFO) << "Restarting TS " << cluster_->tablet_server(crash_test_tserver_index_)->uuid();
  cluster_->tablet_server(crash_test_tserver_index_)->Shutdown();
  ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart());

  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(crash_test_tserver_index_, crash_test_tablet_id_,
                                                 TABLET_DATA_READY, crash_test_timeout_));

  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(5, crash_test_leader_ts_, crash_test_tablet_id_,
                                                crash_test_timeout_));

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(crash_test_workload_->table_name(),
                                                  ClusterVerifier::AT_LEAST,
                                                  crash_test_workload_->rows_inserted()));

  // We know that the remote bootstrap checkpoint on the leader hasn't been cleared because it
  // hasn't restarted. So we don't want to check that it's empty.
  check_checkpoints_cleared_ = false;
}

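// Simulate a remote bootstrap that takes longer than follower_unavailable_considered_failed_sec,
// so the peer being bootstrapped is removed from the config mid-bootstrap, and verify that the
// cluster still converges to a full voter config afterwards.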
TEST_F(RemoteBootstrapITest, TestVeryLongRemoteBootstrap) {
  vector<string> ts_flags, master_flags;

  // Make everything happen 100x faster:
  //  - follower_unavailable_considered_failed_sec from 300 to 3 secs
  //  - raft_heartbeat_interval_ms from 500 to 5 ms
  //  - consensus_rpc_timeout_ms from 3000 to 30 ms

  ts_flags.push_back("--follower_unavailable_considered_failed_sec=3");
  ts_flags.push_back("--raft_heartbeat_interval_ms=5");
  ts_flags.push_back("--consensus_rpc_timeout_ms=30");

  // Increase the number of missed heartbeats used to detect leader failure, since on slow testing
  // instances it is very easy to miss the default (6) heartbeats when they are being sent this
  // fast (every 5 ms).
  ts_flags.push_back("--leader_failure_max_missed_heartbeat_periods=40.0");

  // Make the remote bootstrap take longer than follower_unavailable_considered_failed_sec seconds
  // so the peer gets removed from the config while it is being remote bootstrapped.
  ts_flags.push_back("--TEST_simulate_long_remote_bootstrap_sec=5");

  master_flags.push_back("--enable_load_balancing=false");

  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, 4));

  // We'll do a config change to remote bootstrap a replica here later. For now, shut it down.
  auto constexpr kTsIndex = 0;
  LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(kTsIndex)->uuid();
  cluster_->tablet_server(kTsIndex)->Shutdown();
  auto new_ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();

  // Bounce the Master so it gets new tablet reports and doesn't try to assign a replica to the
  // dead TS.
  const auto timeout = MonoDelta::FromSeconds(40);
  cluster_->master()->Shutdown();
  LOG(INFO) << "Restarting master " << cluster_->master()->uuid();
  ASSERT_OK(cluster_->master()->Restart());
  ASSERT_OK(cluster_->WaitForTabletServerCount(3, timeout));

  // Populate a tablet with some data.
  LOG(INFO) << "Starting workload";
  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(YBTableType::YQL_TABLE_TYPE);
  workload.Start();
  workload.WaitInserted(10);
  LOG(INFO) << "Stopping workload";
  workload.StopAndJoin();

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(1)->uuid()].get();
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Find out who's leader.
  TServerDetails* leader_ts;
  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));

  // Add back TS 0.
  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
  LOG(INFO) << "Adding tserver with uuid " << new_ts->uuid();
  ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_ts, PeerMemberType::PRE_VOTER, boost::none,
                             timeout));
  // After adding new_ts, the leader will detect that TS 0 needs to be remote bootstrapped. Verify
  // that this process completes successfully.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
  LOG(INFO) << "Tablet " << tablet_id << " in state TABLET_DATA_READY in tablet server "
            << new_ts->uuid();

  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(4, leader_ts, tablet_id, timeout));
  LOG(INFO) << "Number of voters for tablet " << tablet_id << " is 4";

  // Ensure all the servers agree before we proceed.
  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(),
                                                  ClusterVerifier::AT_LEAST,
                                                  workload.rows_inserted()));
}

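// Corrupt one replica by deleting its SST files on disk, restart that tserver so local bootstrap
// marks the tablet FAILED, and verify that the peer is then evicted and remote-bootstrapped back
// to RUNNING with a full voter config.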
TEST_F(RemoteBootstrapITest, TestFailedTabletIsRemoteBootstrapped) {
  std::vector<std::string> ts_flags = {
      "--follower_unavailable_considered_failed_sec=30",
      "--raft_heartbeat_interval_ms=50",
      "--consensus_rpc_timeout_ms=300",
      "--TEST_delay_removing_peer_with_failed_tablet_secs=10",
      "--memstore_size_mb=1",
      // Increase the number of missed heartbeats used to detect leader failure, since on slow
      // testing instances it is very easy to miss the default (6) heartbeats when they are being
      // sent this fast (every 50 ms).
      "--leader_failure_max_missed_heartbeat_periods=40.0"
  };

  std::vector<std::string> master_flags = {"--enable_load_balancing=true"};

  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, 3));

  const auto kTimeout = 40s;
  ASSERT_OK(cluster_->WaitForTabletServerCount(3, kTimeout));

  // Populate a tablet with some data.
  LOG(INFO) << "Starting workload";
  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(YBTableType::YQL_TABLE_TYPE);
  workload.set_payload_bytes(1024);
  workload.Start();
  workload.WaitInserted(5000);
  LOG(INFO) << "Stopping workload";
  workload.StopAndJoin();

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Find out who's leader.
  TServerDetails* leader_ts;
  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));

  // Find the first non-leader TS.
  TServerDetails* non_leader_ts = nullptr;
  int non_leader_idx = -1;
  for (int i = 0; i < 3; i++) {
    if (cluster_->tablet_server(i)->uuid() != leader_ts->uuid()) {
      non_leader_ts = ts_map_[cluster_->tablet_server(i)->uuid()].get();
      non_leader_idx = i;
      break;
    }
  }

  ASSERT_NE(non_leader_ts, nullptr);
  ASSERT_NE(non_leader_idx, -1);

  ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::RUNNING, kTimeout));

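  // Locate the tablet's RocksDB data directory by reading its Raft group superblock from the
  // metadata directory, then delete every .sst file to corrupt the replica's on-disk state.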
  auto* env = Env::Default();
  const string data_dir = cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->GetDataDirs()[0];
  auto meta_dir = FsManager::GetRaftGroupMetadataDir(data_dir);
  auto metadata_path = JoinPathSegments(meta_dir, tablet_id);
  tablet::RaftGroupReplicaSuperBlockPB superblock;
  ASSERT_OK(pb_util::ReadPBContainerFromPath(env, metadata_path, &superblock));
  string tablet_data_dir = superblock.kv_store().rocksdb_dir();
  const auto& rocksdb_files = ASSERT_RESULT(env->GetChildren(tablet_data_dir, ExcludeDots::kTrue));
  ASSERT_GT(rocksdb_files.size(), 0);
  for (const auto& file : rocksdb_files) {
    if (file.size() > 4 && file.substr(file.size() - 4) == ".sst") {
      ASSERT_OK(env->DeleteFile(JoinPathSegments(tablet_data_dir, file)));
      LOG(INFO) << "Deleted file " << JoinPathSegments(tablet_data_dir, file);
    }
  }

  // Restart the tserver so that the tablet gets marked as FAILED when it's bootstrapped.
  // Flag TEST_delay_removing_peer_with_failed_tablet_secs will keep the tablet in the FAILED state
  // for the specified amount of time so that we can verify that it was indeed marked as FAILED.
  cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->Shutdown();
  ASSERT_OK(cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->Restart());

  ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::FAILED, kTimeout, 500ms));
  LOG(INFO) << "Tablet " << tablet_id << " in state FAILED in tablet server "
            << non_leader_ts->uuid();

  // After setting the tablet state to FAILED, the leader will detect that this TS needs to be
  // removed from the config so that it can be remote bootstrapped again. Check that this process
  // completes successfully.
  ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::RUNNING, kTimeout));
  LOG(INFO) << "Tablet " << tablet_id << " in state RUNNING in tablet server "
            << non_leader_ts->uuid();

  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, kTimeout));
  LOG(INFO) << "Number of voters for tablet " << tablet_id << " is 3";

  // Ensure all the servers agree before we proceed.
  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, workload.batches_completed()));

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(),
                                                  ClusterVerifier::AT_LEAST,
                                                  workload.rows_inserted()));
}

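// Thin TEST_F wrappers that instantiate the parameterized scenarios above for the key-value
// (YQL) table type.
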
TEST_F(RemoteBootstrapITest, TestRejectRogueLeaderKeyValueType) {
  RejectRogueLeader(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestDeleteTabletDuringRemoteBootstrapKeyValueType) {
  DeleteTabletDuringRemoteBootstrap(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestRemoteBootstrapFollowerWithHigherTermKeyValueType) {
  RemoteBootstrapFollowerWithHigherTerm(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestConcurrentRemoteBootstrapsKeyValueType) {
  ConcurrentRemoteBootstraps(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestDeleteLeaderDuringRemoteBootstrapStressTestKeyValueType) {
  DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestDisableRemoteBootstrap_NoTightLoopWhenTabletDeletedKeyValueType) {
  DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestLeaderCrashesWhileFetchingDataKeyValueTableType) {
  LeaderCrashesWhileFetchingData(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestLeaderCrashesBeforeChangeRoleKeyValueTableType) {
  LeaderCrashesBeforeChangeRole(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestLeaderCrashesAfterChangeRoleKeyValueTableType) {
  LeaderCrashesAfterChangeRole(YBTableType::YQL_TABLE_TYPE);
}

TEST_F(RemoteBootstrapITest, TestClientCrashesBeforeChangeRoleKeyValueTableType) {
  ClientCrashesBeforeChangeRole(YBTableType::YQL_TABLE_TYPE);
}

}  // namespace yb