// src/yb/integration-tests/remote_bootstrap-itest.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations
// under the License.
//

#include <memory>
#include <string>
#include <unordered_map>

#include <boost/optional.hpp>
#include <gtest/gtest.h>

#include "yb/client/client-test-util.h"
#include "yb/client/client.h"
#include "yb/client/schema.h"
#include "yb/client/table_creator.h"

#include "yb/common/wire_protocol-test-util.h"

#include "yb/fs/fs_manager.h"

#include "yb/gutil/stl_util.h"
#include "yb/gutil/strings/substitute.h"

#include "yb/integration-tests/cluster_itest_util.h"
#include "yb/integration-tests/cluster_verifier.h"
#include "yb/integration-tests/external_mini_cluster.h"
#include "yb/integration-tests/external_mini_cluster_fs_inspector.h"
#include "yb/integration-tests/test_workload.h"

#include "yb/tablet/tablet_bootstrap_if.h"
#include "yb/tablet/tablet_metadata.h"

#include "yb/tserver/remote_bootstrap_client.h"
#include "yb/tserver/remote_bootstrap_session.h"
#include "yb/tserver/tserver.pb.h"

#include "yb/util/metrics.h"
#include "yb/util/pb_util.h"
#include "yb/util/pstack_watcher.h"
#include "yb/util/status_log.h"
#include "yb/util/test_util.h"
#include "yb/util/tsan_util.h"

using namespace std::literals;

DEFINE_int32(test_delete_leader_num_iters, 3,
             "Number of iterations to run in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_min_rows_per_iter, 200,
             "Minimum number of rows to insert per iteration "
             "in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_payload_bytes, 16 * 1024,
             "Payload byte size in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(test_delete_leader_num_writer_threads, 1,
             "Number of writer threads in TestDeleteLeaderDuringRemoteBootstrapStressTest.");
DEFINE_int32(remote_bootstrap_itest_timeout_sec, 180,
             "Timeout in seconds to use in remote bootstrap integration test.");

using yb::client::YBClient;
using yb::client::YBClientBuilder;
using yb::client::YBSchema;
using yb::client::YBSchemaFromSchema;
using yb::client::YBTableCreator;
using yb::client::YBTableType;
using yb::client::YBTableName;
using std::shared_ptr;
using yb::consensus::CONSENSUS_CONFIG_COMMITTED;
using yb::consensus::PeerMemberType;
using yb::consensus::RaftPeerPB;
using yb::itest::TServerDetails;
using yb::tablet::TABLET_DATA_READY;
using yb::tablet::TABLET_DATA_TOMBSTONED;
using yb::tserver::ListTabletsResponsePB;
using yb::tserver::RemoteBootstrapClient;
using std::string;
using std::unordered_map;
using std::vector;
using strings::Substitute;

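// Server-level metrics referenced by these tests, e.g. to count UpdateConsensus calls and
// glog output when checking that a deleted tablet doesn't trigger an RPC tight loop.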
METRIC_DECLARE_entity(server);
METRIC_DECLARE_histogram(handler_latency_yb_consensus_ConsensusService_UpdateConsensus);
METRIC_DECLARE_counter(glog_info_messages);
METRIC_DECLARE_counter(glog_warning_messages);
METRIC_DECLARE_counter(glog_error_messages);

namespace yb {

using yb::tablet::TabletDataState;

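// Integration tests exercising remote bootstrap on an external mini cluster: bootstrapping
// newly added peers, surviving leader/client crashes mid-bootstrap, rejecting requests from
// stale ("zombie") leaders, and limiting concurrent bootstrap sessions.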
class RemoteBootstrapITest : public YBTest {
 public:
  void TearDown() override {
    client_.reset();
    if (HasFatalFailure()) {
      LOG(INFO) << "Found fatal failure";
      if (cluster_) {
        for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
          if (!cluster_->tablet_server(i)->IsProcessAlive()) {
            LOG(INFO) << "Tablet server " << i << " is not running. Cannot dump its stacks.";
            continue;
          }
          LOG(INFO) << "Attempting to dump stacks of TS " << i
                    << " with UUID " << cluster_->tablet_server(i)->uuid()
                    << " and pid " << cluster_->tablet_server(i)->pid();
          WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()),
                      "Couldn't dump stacks");
        }
      }
    } else if (check_checkpoints_cleared_ && cluster_) {
      CheckCheckpointsCleared();
    }
    if (cluster_) {
      cluster_->Shutdown();
    }
    YBTest::TearDown();
    ts_map_.clear();
  }

 protected:
  void StartCluster(const vector<string>& extra_tserver_flags = vector<string>(),
                    const vector<string>& extra_master_flags = vector<string>(),
                    int num_tablet_servers = 3);

  void RejectRogueLeader(YBTableType table_type);
  void DeleteTabletDuringRemoteBootstrap(YBTableType table_type);
  void RemoteBootstrapFollowerWithHigherTerm(YBTableType table_type);
  void ConcurrentRemoteBootstraps(YBTableType table_type);
  void DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType table_type);
  void DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted(YBTableType table_type);

  void CrashTestSetUp(YBTableType table_type);
  void CrashTestVerify();
  // The following tests verify that a newly added tserver gets bootstrapped even if the leader
  // crashes while bootstrapping it.
  void LeaderCrashesWhileFetchingData(YBTableType table_type);
  void LeaderCrashesBeforeChangeRole(YBTableType table_type);
  void LeaderCrashesAfterChangeRole(YBTableType table_type);

  void ClientCrashesBeforeChangeRole(YBTableType table_type);

  void StartCrashedTabletServer(
      TabletDataState expected_data_state = TabletDataState::TABLET_DATA_TOMBSTONED);
  void CheckCheckpointsCleared();

  void CreateTableAssignLeaderAndWaitForTabletServersReady(const YBTableType table_type,
                                                           const YBTableName& table_name,
                                                           const int num_tablets,
                                                           const int expected_num_tablets_per_ts,
                                                           const int leader_index,
                                                           const MonoDelta& timeout,
                                                           vector<string>* tablet_ids);

  std::unique_ptr<ExternalMiniCluster> cluster_;
  std::unique_ptr<itest::ExternalMiniClusterFsInspector> inspect_;
  std::unique_ptr<YBClient> client_;
  itest::TabletServerMap ts_map_;

  MonoDelta crash_test_timeout_;
  const MonoDelta kWaitForCrashTimeout_ = 60s;
  vector<string> crash_test_tserver_flags_;
  std::unique_ptr<TestWorkload> crash_test_workload_;
  TServerDetails* crash_test_leader_ts_ = nullptr;
  int crash_test_tserver_index_ = -1;
  int crash_test_leader_index_ = -1;
  string crash_test_tablet_id_;
  bool check_checkpoints_cleared_ = true;
};

void RemoteBootstrapITest::StartCluster(const vector<string>& extra_tserver_flags,
                                        const vector<string>& extra_master_flags,
                                        int num_tablet_servers) {
  ExternalMiniClusterOptions opts;
  opts.num_tablet_servers = num_tablet_servers;
  opts.extra_tserver_flags = extra_tserver_flags;
  opts.extra_tserver_flags.emplace_back("--remote_bootstrap_idle_timeout_ms=10000");
  opts.extra_tserver_flags.emplace_back("--never_fsync");  // fsync causes flakiness on EC2.
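  // Sanitizer builds run much more slowly, so relax leader failure detection and remote
  // bootstrap/RPC timeouts to avoid spurious elections and timed-out sessions.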
  if (IsTsan()) {
    opts.extra_tserver_flags.emplace_back("--leader_failure_max_missed_heartbeat_periods=20");
    opts.extra_tserver_flags.emplace_back("--remote_bootstrap_begin_session_timeout_ms=13000");
    opts.extra_tserver_flags.emplace_back("--rpc_connection_timeout_ms=10000");
  } else if (IsSanitizer()) {
    opts.extra_tserver_flags.emplace_back("--leader_failure_max_missed_heartbeat_periods=15");
    opts.extra_tserver_flags.emplace_back("--remote_bootstrap_begin_session_timeout_ms=10000");
    opts.extra_tserver_flags.emplace_back("--rpc_connection_timeout_ms=7500");
  }

  opts.extra_master_flags = extra_master_flags;
  cluster_.reset(new ExternalMiniCluster(opts));
  ASSERT_OK(cluster_->Start());
  inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get()));
  ts_map_ = ASSERT_RESULT(itest::CreateTabletServerMap(cluster_.get()));
  client_ = ASSERT_RESULT(cluster_->CreateClient());
}

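// Verify that the RocksDB checkpoints directory of every tablet on every tablet server
// eventually becomes empty, i.e. remote bootstrap sessions cleaned up after themselves.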
void RemoteBootstrapITest::CheckCheckpointsCleared() {
  auto* env = Env::Default();
  auto deadline = MonoTime::Now() + 10s * kTimeMultiplier;
  for (size_t i = 0; i != cluster_->num_tablet_servers(); ++i) {
    auto tablet_server = cluster_->tablet_server(i);
    auto data_dir = tablet_server->GetDataDirs()[0];
    SCOPED_TRACE(Format("Index: $0", i));
    SCOPED_TRACE(Format("UUID: $0", tablet_server->uuid()));
    SCOPED_TRACE(Format("Data dir: $0", data_dir));
    auto meta_dir = FsManager::GetRaftGroupMetadataDir(data_dir);
    auto tablets = ASSERT_RESULT(env->GetChildren(meta_dir, ExcludeDots::kTrue));
    for (const auto& tablet : tablets) {
      SCOPED_TRACE(Format("Tablet: $0", tablet));
      auto metadata_path = JoinPathSegments(meta_dir, tablet);
      tablet::RaftGroupReplicaSuperBlockPB superblock;
      if (!pb_util::ReadPBContainerFromPath(env, metadata_path, &superblock).ok()) {
        // There is a race condition between this and a DeleteTablet request, so skip this
        // tablet if we can't read the superblock.
        continue;
      }
      auto checkpoints_dir = JoinPathSegments(
          superblock.kv_store().rocksdb_dir(), tserver::RemoteBootstrapSession::kCheckpointsDir);
      ASSERT_OK(Wait([env, checkpoints_dir]() -> bool {
        if (env->FileExists(checkpoints_dir)) {
          auto checkpoints = CHECK_RESULT(env->GetChildren(checkpoints_dir, ExcludeDots::kTrue));
          if (!checkpoints.empty()) {
            LOG(INFO) << "Checkpoints: " << yb::ToString(checkpoints);
            return false;
          }
        }
        return true;
      }, deadline, "Wait checkpoints empty"));
    }
  }
}

void RemoteBootstrapITest::CrashTestSetUp(YBTableType table_type) {
  crash_test_tserver_flags_.push_back("--log_segment_size_mb=1");  // Faster log rolls.
  // Start the cluster with load balancer turned off.
  vector<string> master_flags;
  master_flags.push_back("--enable_load_balancing=false");
  master_flags.push_back("--replication_factor=4");
  ASSERT_NO_FATALS(StartCluster(crash_test_tserver_flags_, master_flags, 5));
  crash_test_tserver_index_ = 0;  // We'll test with the first TS.

  LOG(INFO) << "Started cluster";
  // We'll do a config change to remote bootstrap a replica here later. For
  // now, shut it down.
  LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(crash_test_tserver_index_)->uuid();
  cluster_->tablet_server(crash_test_tserver_index_)->Shutdown();

  // Bounce the Master so it gets new tablet reports and doesn't try to assign
  // a replica to the dead TS.
  cluster_->master()->Shutdown();
  ASSERT_OK(cluster_->master()->Restart());
  ASSERT_OK(cluster_->WaitForTabletServerCount(4, crash_test_timeout_));

  // Start a workload on the cluster, and run it for a little while.
  crash_test_workload_.reset(new TestWorkload(cluster_.get()));
  crash_test_workload_->set_sequential_write(true);
  crash_test_workload_->Setup();
  ASSERT_OK(inspect_->WaitForReplicaCount(4));

  vector<string> tablets = inspect_->ListTabletsOnTS(1);
  ASSERT_EQ(1, tablets.size());
  crash_test_tablet_id_ = tablets[0];

  crash_test_workload_->Start();
  crash_test_workload_->WaitInserted(100);

  // Remote bootstrap doesn't see the active WAL segment, and we need to
  // download a file to trigger the fault in this test. Due to the log index
  // chunks, that means 3 files minimum: One in-flight WAL segment, one index
  // chunk file (these files grow much more slowly than the WAL segments), and
  // one completed WAL segment.
  crash_test_leader_ts_ = nullptr;
  ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_,
                             &crash_test_leader_ts_));
  crash_test_leader_index_ = cluster_->tablet_server_index_by_uuid(crash_test_leader_ts_->uuid());
  ASSERT_NE(-1, crash_test_leader_index_);
  ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(crash_test_leader_index_,
                                                        crash_test_tablet_id_, 3));
  crash_test_workload_->StopAndJoin();
}

void RemoteBootstrapITest::CrashTestVerify() {
  // Wait until the tablet has been tombstoned on TS 0. This will happen after a call to
  // rb_client->Finish() tries to end the remote bootstrap session with the crashed leader. The
  // returned error will cause the tablet to be tombstoned by the TOMBSTONE_NOT_OK macro.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, crash_test_tablet_id_,
                                                 TABLET_DATA_TOMBSTONED, crash_test_timeout_));

  // After crash_test_leader_ts_ crashes, a new leader will be elected. This new leader will detect
  // that TS 0 needs to be remote bootstrapped. Verify that this process completes successfully.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, crash_test_tablet_id_,
                                                 TABLET_DATA_READY, crash_test_timeout_ * 3));
  auto dead_leader = crash_test_leader_ts_;
  LOG(INFO) << "Dead leader: " << dead_leader->ToString();
  MonoTime start_time = MonoTime::Now();
  Status status;
  TServerDetails* new_leader;
  do {
    ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_, &new_leader));
    status = WaitUntilCommittedConfigNumVotersIs(5, new_leader, crash_test_tablet_id_,
                                                 MonoDelta::FromSeconds(1));
    if (status.ok()) {
      break;
    }
  } while (MonoTime::Now().GetDeltaSince(start_time).ToSeconds() < crash_test_timeout_.ToSeconds());

  CHECK_OK(status);

  start_time = MonoTime::Now();
  do {
    ASSERT_OK(FindTabletLeader(ts_map_, crash_test_tablet_id_, crash_test_timeout_,
                               &crash_test_leader_ts_));

    Status s = RemoveServer(new_leader, crash_test_tablet_id_, dead_leader, boost::none,
                            MonoDelta::FromSeconds(1), NULL, false /* retry */);
    if (s.ok()) {
      break;
    }

    // Ignore the return status if the leader is not ready or if the leader changed.
    if (s.ToString().find("Leader is not ready") == string::npos &&
        s.ToString().find("is not leader of this config") == string::npos) {
      CHECK_OK(s);
    }

    SleepFor(MonoDelta::FromMilliseconds(500));
  } while (MonoTime::Now().GetDeltaSince(start_time).ToSeconds() < crash_test_timeout_.ToSeconds());

  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(4, new_leader, crash_test_tablet_id_,
                                                crash_test_timeout_));

  ClusterVerifier cluster_verifier(cluster_.get());
  // Skip cluster_verifier.CheckCluster() because it calls ListTabletServers, which gets its list
  // from TSManager::GetAllDescriptors. That list includes the tserver that is in a crash loop, so
  // the check would always fail.
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(crash_test_workload_->table_name(),
                                                  ClusterVerifier::AT_LEAST,
                                                  crash_test_workload_->rows_inserted()));

  StartCrashedTabletServer();
}

void RemoteBootstrapITest::StartCrashedTabletServer(TabletDataState expected_data_state) {
  // Restore the leader so it can clean up its checkpoints.
  LOG(INFO) << "Starting crashed " << crash_test_leader_index_;
  // It is actually already stopped; we call Shutdown() just to synchronize state.
  cluster_->tablet_server(crash_test_leader_index_)->Shutdown();
  ASSERT_OK(cluster_->tablet_server(crash_test_leader_index_)->Start());
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
      crash_test_leader_index_, crash_test_tablet_id_, expected_data_state));
}

// If a rogue (a.k.a. zombie) leader tries to remote bootstrap a tombstoned
// tablet, make sure its term isn't older than the latest term we observed.
// If it is older, make sure we reject the request, to avoid allowing old
// leaders to create a parallel universe. This is possible because config
// change could cause nodes to move around. The term check is reasonable
// because only one node can be elected leader for a given term.
//
// A leader can "go rogue" due to a VM pause, CTRL-z, partition, etc.
void RemoteBootstrapITest::RejectRogueLeader(YBTableType table_type) {
  // This test pauses for at least 10 seconds. Only run in slow-test mode.
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
  };
  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--use_create_table_leader_hint=false"s,
  };
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const MonoDelta timeout = MonoDelta::FromSeconds(30);
  const int kTsIndex = 0;  // We'll test with the first TS.
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  // Elect a leader for term 1, then run some data through the cluster.
  int zombie_leader_index = 1;
  string zombie_leader_uuid = cluster_->tablet_server(zombie_leader_index)->uuid();
  ASSERT_OK(itest::StartElection(ts_map_[zombie_leader_uuid].get(), tablet_id, timeout));
  workload.Start();
  workload.WaitInserted(100);
  workload.StopAndJoin();

  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Come out of the blue and try to remotely bootstrap a running server while
  // specifying an old term. That running server should reject the request.
  // We are essentially masquerading as a rogue leader here.
  Status s = itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid,
                                         HostPort(cluster_->tablet_server(1)->bound_rpc_addr()),
                                         0,  // Say I'm from term 0.
                                         timeout);
  ASSERT_TRUE(s.IsInvalidArgument());
  ASSERT_STR_CONTAINS(s.ToString(), "term 0 lower than last logged term 1");

  // Now pause the actual leader so we can bring him back as a zombie later.
  ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Pause());

  // Trigger TS 2 to become leader of term 2.
  int new_leader_index = 2;
  string new_leader_uuid = cluster_->tablet_server(new_leader_index)->uuid();
  ASSERT_OK(itest::StartElection(ts_map_[new_leader_uuid].get(), tablet_id, timeout));
  ASSERT_OK(itest::WaitUntilLeader(ts_map_[new_leader_uuid].get(), tablet_id, timeout));

  auto active_ts_map = CreateTabletServerMapUnowned(ts_map_);
  ASSERT_EQ(1, active_ts_map.erase(zombie_leader_uuid));

  // Wait for the NO_OP entry from the term 2 election to propagate to the
  // remaining nodes' logs so that we are guaranteed to reject the rogue
  // leader's remote bootstrap request when we bring it back online.
  auto log_index = workload.batches_completed() + 2;  // 2 terms == 2 additional NO_OP entries.
  ASSERT_OK(WaitForServersToAgree(timeout, active_ts_map, tablet_id, log_index));
  // TODO: Write more rows to the new leader once KUDU-1034 is fixed.

  // Now kill the new leader and tombstone the replica on TS 0.
  cluster_->tablet_server(new_leader_index)->Shutdown();
  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout));

  // Zombies!!! Resume the rogue zombie leader.
  // He should attempt to remote bootstrap TS 0 but fail.
  ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume());

  // Loop for a few seconds to ensure that the tablet doesn't transition to READY.
  MonoTime deadline = MonoTime::Now();
  deadline.AddDelta(MonoDelta::FromSeconds(5));
  while (MonoTime::Now().ComesBefore(deadline)) {
    ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
    ASSERT_EQ(1, tablets.size());
    ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state());
    SleepFor(MonoDelta::FromMilliseconds(10));
  }

  // Force the rogue leader to step down.
  // Then, send a remote bootstrap start request from a "fake" leader that
  // sends an up-to-date term in the RB request, while the actual term stored
  // in the bootstrap source's consensus metadata is still old.
  LOG(INFO) << "Forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid
            << " to step down...";
  ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid].get(), tablet_id, nullptr, timeout));
  ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index);
  // It's not necessarily part of the API, but this could return failure due to
  // the remote rejecting the request. We intend to make that part async though, so we
  // ignore this return value in this test.
  WARN_NOT_OK(itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid,
                                          HostPort(zombie_ets->bound_rpc_addr()),
                                          2,  // Say I'm from term 2.
                                          timeout),
              "Start remote bootstrap failed");

  // Wait another few seconds to be sure the remote bootstrap is rejected.
  deadline = MonoTime::Now();
  deadline.AddDelta(MonoDelta::FromSeconds(5));
  while (MonoTime::Now().ComesBefore(deadline)) {
    ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
    ASSERT_EQ(1, tablets.size());
    ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state());
    SleepFor(MonoDelta::FromMilliseconds(10));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
                                                  workload.rows_inserted()));
}

// Start a remote bootstrap session and delete the tablet in the middle.
// It should actually be possible to complete bootstrap in such a case, because
// when a remote bootstrap session is started on the "source" server, all of
// the relevant files are either read or opened, meaning that an in-progress
// remote bootstrap can complete even after a tablet is officially "deleted" on
// the source server. This is also a regression test for KUDU-1009.
//
// For YugabyteDB we have modified this test. We no longer expect the remote bootstrap
// to finish successfully after the tablet has been deleted on the leader, but we do
// expect the leader not to crash.
void RemoteBootstrapITest::DeleteTabletDuringRemoteBootstrap(YBTableType table_type) {
  MonoDelta timeout = MonoDelta::FromSeconds(10);
  const int kTsIndex = 0;  // We'll test with the first TS.
  ASSERT_NO_FATALS(StartCluster());

  // Populate a tablet with some data.
  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);
  workload.Start();
  workload.WaitInserted(1000);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Ensure all the servers agree before we proceed.
  workload.StopAndJoin();
  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Set up an FsManager to use with the RemoteBootstrapClient.
  FsManagerOpts opts;
  string testbase = GetTestPath("fake-ts");
  ASSERT_OK(env_->CreateDir(testbase));
  opts.wal_paths.push_back(JoinPathSegments(testbase, "wals"));
  opts.data_paths.push_back(JoinPathSegments(testbase, "data-0"));
  opts.server_type = "tserver_test";
  std::unique_ptr<FsManager> fs_manager(new FsManager(env_.get(), opts));
  ASSERT_OK(fs_manager->CreateInitialFileSystemLayout());
  ASSERT_OK(fs_manager->Open());

  // Start up a RemoteBootstrapClient and open a remote bootstrap session.
  RemoteBootstrapClient rb_client(tablet_id, fs_manager.get());
  scoped_refptr<tablet::RaftGroupMetadata> meta;
  ASSERT_OK(rb_client.Start(cluster_->tablet_server(kTsIndex)->uuid(),
                            &cluster_->proxy_cache(),
                            cluster_->tablet_server(kTsIndex)->bound_rpc_hostport(),
                            &meta));

  // Tombstone the tablet on the remote!
  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout));

  // Now finish bootstrapping!
  tablet::TabletStatusListener listener(meta);

  // This will fail because the leader won't have any rocksdb files, since they were deleted when
  // we called DeleteTablet.
  ASSERT_NOK(rb_client.FetchAll(&listener));
  ASSERT_OK(rb_client.EndRemoteSession());
  ASSERT_OK(rb_client.Remove());

  SleepFor(MonoDelta::FromMilliseconds(500));  // Give a little time for a crash (KUDU-1009).
  ASSERT_TRUE(cluster_->tablet_server(kTsIndex)->IsProcessAlive());

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
                                                  workload.rows_inserted()));
}

TEST_F(RemoteBootstrapITest, IncompleteWALDownloadDoesntCauseCrash) {
  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--enable_load_balancing=false"s,
    "--use_create_table_leader_hint=false"s,
  };

  int constexpr kBootstrapIdleTimeoutMs = 5000;

  std::vector<std::string> ts_flags = {
    "--log_min_segments_to_retain=1000"s,
    "--log_min_seconds_to_retain=900"s,
    "--log_segment_size_bytes=65536"s,
    "--enable_leader_failure_detection=false"s,
    // Disable pre-elections since we wait for the term to become 2,
    // which does not happen with pre-elections.
    "--use_preelection=false"s,
    "--memstore_size_mb=1"s,
    "--TEST_download_partial_wal_segments=true"s,
    "--remote_bootstrap_idle_timeout_ms="s + std::to_string(kBootstrapIdleTimeoutMs)
  };

  const int kNumTabletServers = 3;
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers));

  // Elect a leader for term 1, then run some data through the cluster.
  const int kLeaderIndex = 1;

  constexpr int kNumTablets = RegularBuildVsSanitizers(10, 2);
  MonoDelta timeout = MonoDelta::FromSeconds(RegularBuildVsSanitizers(15, 45));

  vector<string> tablet_ids;

  CreateTableAssignLeaderAndWaitForTabletServersReady(YBTableType::YQL_TABLE_TYPE,
      TestWorkloadOptions::kDefaultTableName, kNumTablets, kNumTablets, kLeaderIndex, timeout,
      &tablet_ids);

  const int kFollowerIndex = 0;
  TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_write_batch_size(10);
  workload.set_num_write_threads(10);
  workload.set_sequential_write(true);
  workload.Setup(YBTableType::YQL_TABLE_TYPE);
  workload.Start();
  workload.WaitInserted(RegularBuildVsSanitizers(500, 5000));
  workload.StopAndJoin();

  // Ensure all the servers agree before we proceed.
  for (const auto& tablet_id : tablet_ids) {
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }
  LOG(INFO) << "All servers agree";
  // Now tombstone the follower.
  ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_ids[0], TABLET_DATA_TOMBSTONED, boost::none,
                                timeout));
  LOG(INFO) << "Successfully sent delete request " << tablet_ids[0] << " on TS " << kFollowerIndex;
  // Wait until the tablet has been tombstoned on the follower.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_ids[0],
                                                 tablet::TABLET_DATA_TOMBSTONED, timeout));
  LOG(INFO) << "Tablet state on TS " << kFollowerIndex << " is TABLET_DATA_TOMBSTONED";
  SleepFor(MonoDelta::FromMilliseconds(5000));  // Give a little time for a crash.
  ASSERT_TRUE(cluster_->tablet_server(kFollowerIndex)->IsProcessAlive());
  LOG(INFO) << "TS " << kFollowerIndex << " didn't crash";
  // Now remove the crash flag, so the next replay will complete.
  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(kFollowerIndex),
                              "TEST_download_partial_wal_segments", "false"));
  LOG(INFO) << "Successfully turned off flag TEST_download_partial_wal_segments";
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_ids[0],
                                                 tablet::TABLET_DATA_READY, timeout * 4));
  LOG(INFO) << "Tablet " << tablet_ids[0]
            << " is in state TABLET_DATA_READY in TS " << kFollowerIndex;
  TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get();
  OpId op_id = ASSERT_RESULT(GetLastOpIdForReplica(
      tablet_ids[0], leader_ts, consensus::COMMITTED_OPID, timeout));

  auto expected_index = op_id.index;

  ASSERT_OK(WaitForServersToAgree(timeout,
                                  ts_map_,
                                  tablet_ids[0],
                                  expected_index,
                                  &expected_index));

  LOG(INFO) << "Op id index in TS " << kFollowerIndex << " is " << op_id.index
            << " for tablet " << tablet_ids[0];

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::EXACTLY,
                                                  workload.rows_inserted()));

  LOG(INFO) << "Cluster verifier succeeded";

  // Sleep to make sure all the remote bootstrap sessions that were initiated but
  // not completed are expired.
  SleepFor(MonoDelta::FromMilliseconds(kBootstrapIdleTimeoutMs * 2));
}

// This test ensures that a leader can remote-bootstrap a tombstoned replica
// that has a higher term recorded in the replica's consensus metadata if the
// replica's last-logged opid has the same term (or less) as the leader serving
// as the remote bootstrap source. When a tablet is tombstoned, its last-logged
// opid is stored in a field of its on-disk superblock.
void RemoteBootstrapITest::RemoteBootstrapFollowerWithHigherTerm(YBTableType table_type) {
  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
    // Disable pre-elections since we wait for the term to become 2,
    // which does not happen with pre-elections.
    "--use_preelection=false"s
  };

  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--replication_factor=2"s,
    "--use_create_table_leader_hint=false"s,
  };

  const int kNumTabletServers = 2;
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers));

  const MonoDelta timeout = MonoDelta::FromSeconds(30);
  const int kFollowerIndex = 0;
  TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get();

  TestWorkload workload(cluster_.get());
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id of the created tablet.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(follower_ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  // Elect a leader for term 1, then run some data through the cluster.
  const int kLeaderIndex = 1;
  TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get();
  ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout));
  workload.Start();
  workload.WaitInserted(100);
  workload.StopAndJoin();

  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  // Pause the leader and increment the term on the follower by starting an
  // election on the follower. The election will fail asynchronously, but we
  // just wait until we see that its term has incremented.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause());
  ASSERT_OK(itest::StartElection(
      follower_ts, tablet_id, timeout, consensus::TEST_SuppressVoteRequest::kTrue));
  int64_t term = 0;
  for (int i = 0; i < 1000; i++) {
    consensus::ConsensusStatePB cstate;
    ASSERT_OK(itest::GetConsensusState(follower_ts, tablet_id, CONSENSUS_CONFIG_COMMITTED,
                                       timeout, &cstate));
    term = cstate.current_term();
    if (term == 2) break;
    SleepFor(MonoDelta::FromMilliseconds(10));
  }
  ASSERT_EQ(2, term);

  // Now tombstone the follower.
  ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                timeout));

  // Wait until the tablet has been tombstoned on the follower.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
                                                 tablet::TABLET_DATA_TOMBSTONED, timeout));

  // Now wake the leader. It should detect that the follower needs to be
  // remotely bootstrapped and proceed to bring it back up to date.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume());

  // Wait for remote bootstrap to complete successfully.
  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
                                                 tablet::TABLET_DATA_READY, timeout));

  // Wait for the follower to come back up.
  ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed()));

  ClusterVerifier cluster_verifier(cluster_.get());
  // During this test we disable leader failure detection, so we use CONSISTENT_PREFIX for
  // verification because the tablet could end up without a leader at all. We also don't call
  // CheckCluster for this reason.
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(
      workload.table_name(), ClusterVerifier::EXACTLY, workload.rows_inserted(),
      YBConsistencyLevel::CONSISTENT_PREFIX));
}

void RemoteBootstrapITest::CreateTableAssignLeaderAndWaitForTabletServersReady(
    const YBTableType table_type, const YBTableName& table_name, const int num_tablets,
    const int expected_num_tablets_per_ts, const int leader_index, const MonoDelta& timeout,
    vector<string>* tablet_ids) {

  ASSERT_OK(client_->CreateNamespaceIfNotExists(table_name.namespace_name()));

  // Create a table with several tablets. These will all be simultaneously
  // remotely bootstrapped to a single target node from the same leader host.
  YBSchema client_schema(YBSchemaFromSchema(GetSimpleTestSchema()));
  std::unique_ptr<YBTableCreator> table_creator(client_->NewTableCreator());
  ASSERT_OK(table_creator->table_name(table_name)
      .num_tablets(num_tablets)
      .schema(&client_schema)
      .table_type(table_type)
      .Create());

  TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get();

  // Figure out the tablet ids of the created tablets.
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, expected_num_tablets_per_ts, timeout, &tablets));

  for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) {
    tablet_ids->push_back(t.tablet_status().tablet_id());
  }

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    for (const string& tablet_id : *tablet_ids) {
      ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                              tablet_id, timeout));
    }
  }

  // Elect leaders on each tablet for term 1. All leaders will be on TS leader_index.
  const string kLeaderUuid = cluster_->tablet_server(leader_index)->uuid();
  for (const string& tablet_id : *tablet_ids) {
    ASSERT_OK(itest::StartElection(ts_map_[kLeaderUuid].get(), tablet_id, timeout));
  }

  for (const string& tablet_id : *tablet_ids) {
    TServerDetails* leader_ts = nullptr;
    ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
    ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, timeout));
  }
}

// Test that multiple concurrent remote bootstraps do not cause problems.
// This is a regression test for KUDU-951, in which concurrent sessions on
// multiple tablets between the same remote bootstrap client host and remote
// bootstrap source host could corrupt each other.
void RemoteBootstrapITest::ConcurrentRemoteBootstraps(YBTableType table_type) {
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  std::vector<std::string> ts_flags = {
    "--enable_leader_failure_detection=false"s,
    "--log_cache_size_limit_mb=1"s,
    "--log_segment_size_mb=1"s,
    "--log_async_preallocate_segments=false"s,
    "--log_min_segments_to_retain=100"s,
    "--maintenance_manager_polling_interval_ms=10"s,
  };
  std::vector<std::string> master_flags = {
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    "--enable_load_balancing=false"s,
    "--use_create_table_leader_hint=false"s,
  };
  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const int kNumTablets = 10;
  const int kLeaderIndex = 1;
  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  vector<string> tablet_ids;

  CreateTableAssignLeaderAndWaitForTabletServersReady(table_type,
      TestWorkloadOptions::kDefaultTableName, kNumTablets, kNumTablets, kLeaderIndex, timeout,
      &tablet_ids);

  TestWorkload workload(cluster_.get());
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_write_batch_size(10);
  workload.set_num_write_threads(10);
  workload.Setup(table_type);
  workload.Start();
  workload.WaitInserted(20000);
  workload.StopAndJoin();

  for (const string& tablet_id : tablet_ids) {
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }

  // Now pause the leader so we can tombstone the tablets.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause());

  const int kTsIndex = 0;  // We'll test with the first TS.
  TServerDetails* target_ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();

  for (const string& tablet_id : tablet_ids) {
    LOG(INFO) << "Tombstoning tablet " << tablet_id << " on TS " << target_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(target_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  MonoDelta::FromSeconds(10)));
  }

  // Unpause the leader TS and wait for it to remotely bootstrap the tombstoned
  // tablets, in parallel.
  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume());
  for (const string& tablet_id : tablet_ids) {
    ASSERT_OK(itest::WaitUntilTabletRunning(target_ts, tablet_id, timeout));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
                                                  workload.rows_inserted()));
}

TEST_F(RemoteBootstrapITest, TestLimitNumberOfConcurrentRemoteBootstraps) {
  constexpr int kMaxConcurrentTabletRemoteBootstrapSessions = 5;
  constexpr int kMaxConcurrentTabletRemoteBootstrapSessionsPerTable = 2;

  vector<string> ts_flags;
  int follower_considered_failed_sec = 10;
  ts_flags.push_back("--follower_unavailable_considered_failed_sec=" +
                     std::to_string(follower_considered_failed_sec));
  ts_flags.push_back("--heartbeat_interval_ms=100");
  ts_flags.push_back("--enable_leader_failure_detection=false");
  ts_flags.push_back("--TEST_crash_if_remote_bootstrap_sessions_greater_than=" +
                     std::to_string(kMaxConcurrentTabletRemoteBootstrapSessions + 1));
  ts_flags.push_back("--TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than=" +
                     std::to_string(kMaxConcurrentTabletRemoteBootstrapSessionsPerTable + 1));
  ts_flags.push_back("--TEST_simulate_long_remote_bootstrap_sec=5");

  std::vector<std::string> master_flags = {
    "--TEST_load_balancer_handle_under_replicated_tablets_only=true"s,
    "--load_balancer_max_concurrent_tablet_remote_bootstraps=" +
        std::to_string(kMaxConcurrentTabletRemoteBootstrapSessions),
    "--load_balancer_max_concurrent_tablet_remote_bootstraps_per_table=" +
        std::to_string(kMaxConcurrentTabletRemoteBootstrapSessionsPerTable),
    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s,
    // This value has to be less than follower_considered_failed_sec.
    "--tserver_unresponsive_timeout_ms=8000"s,
    "--use_create_table_leader_hint=false"s,
  };

  ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags));

  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  const int kLeaderIndex = 1;
  const int kNumberTables = 3;
  const int kNumTablets = 8;
  vector<vector<string>> tablet_ids;

  for (int i = 0; i < kNumberTables; i++) {
    tablet_ids.push_back(vector<string>());
    std::string table_name_str = "table_test_" + std::to_string(i);
    std::string keyspace_name_str = "keyspace_test_" + std::to_string(i);
    YBTableName table_name(YQL_DATABASE_CQL, keyspace_name_str, table_name_str);
    CreateTableAssignLeaderAndWaitForTabletServersReady(YBTableType::YQL_TABLE_TYPE,
        table_name, kNumTablets, (i + 1) * kNumTablets, kLeaderIndex, timeout, &(tablet_ids[i]));
  }

  const int kTsIndex = 0;  // We'll test with the first TS.

  // Now pause the first tserver so that it gets removed from the configuration for all of the
  // tablets.
  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Pause());
  LOG(INFO) << "Paused tserver " << cluster_->tablet_server(kTsIndex)->uuid();

  // Sleep for longer than FLAGS_follower_unavailable_considered_failed_sec to guarantee that the
  // other peers in the config for each tablet remove this tserver from the raft config.
  int total_time_slept = 0;
  while (total_time_slept < follower_considered_failed_sec * 2) {
    SleepFor(MonoDelta::FromSeconds(1));
    total_time_slept++;
    LOG(INFO) << "Total time slept " << total_time_slept;
  }

  // Resume the tserver. The cluster balancer will ensure that all the tablets are added back to
  // this tserver, and it will cause the leader to start remote bootstrap sessions for all of the
  // tablets. FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than will make sure that we
  // never have more than the expected number of concurrent remote bootstrap sessions.
  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Resume());

  LOG(INFO) << "Tserver " << cluster_->tablet_server(kTsIndex)->uuid() << " resumed";

  // Wait until the config for each of the tablets has three voters. This means that the tserver
  // we just resumed was remote bootstrapped correctly.
  for (int i = 0; i < kNumberTables; i++) {
    for (const string& tablet_id : tablet_ids[i]) {
      TServerDetails* leader_ts = nullptr;
      ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
      LOG(INFO) << "Waiting until config has 3 voters for tablet " << tablet_id;
      ASSERT_OK(itest::WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, timeout));
    }
  }
}

// Test that repeatedly runs a load, tombstones a follower, then tombstones the
// leader while the follower is remotely bootstrapping. Regression test for
// KUDU-1047.
void RemoteBootstrapITest::DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType table_type) {
  // This test takes a while due to failure detection.
  if (!AllowSlowTests()) {
    LOG(INFO) << "Skipping test in fast-test mode.";
    return;
  }

  const MonoDelta timeout = MonoDelta::FromSeconds(FLAGS_remote_bootstrap_itest_timeout_sec);
  vector<string> master_flags;
  master_flags.push_back("--replication_factor=5");
  ASSERT_NO_FATALS(StartCluster(vector<string>(), master_flags, 5));

  TestWorkload workload(cluster_.get());
  workload.set_payload_bytes(FLAGS_test_delete_leader_payload_bytes);
  workload.set_num_write_threads(FLAGS_test_delete_leader_num_writer_threads);
  workload.set_write_batch_size(1);
  workload.set_write_timeout_millis(10000);
  workload.set_timeout_allowed(true);
  workload.set_not_found_allowed(true);
  workload.set_sequential_write(true);
  workload.Setup(table_type);

  // Figure out the tablet id.
  const int kTsIndex = 0;
  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get();
  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
  string tablet_id = tablets[0].tablet_status().tablet_id();

  // Wait until all replicas are up and running.
  for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) {
    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(),
                                            tablet_id, timeout));
  }

  int leader_index = -1;
  int follower_index = -1;
  TServerDetails* leader_ts = nullptr;
  TServerDetails* follower_ts = nullptr;

  for (int i = 0; i < FLAGS_test_delete_leader_num_iters; i++) {
    LOG(INFO) << "Iteration " << (i + 1);
    auto rows_previously_inserted = workload.rows_inserted();

    // Find out who's leader.
    ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts));
    leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid());

    // Select an arbitrary follower.
    follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
    follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()].get();

    // Spin up the workload.
    workload.Start();
    while (workload.rows_inserted() < rows_previously_inserted +
                                      FLAGS_test_delete_leader_min_rows_per_iter) {
      SleepFor(MonoDelta::FromMilliseconds(10));
    }

    // Tombstone the follower.
    LOG(INFO) << "Tombstoning follower tablet " << tablet_id << " on TS " << follower_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  timeout));

    // Wait for remote bootstrap to start.
    // ENG-81: There is a frequent race condition here: if the bootstrap happens too quickly, we
    // can see TABLET_DATA_READY right away without seeing TABLET_DATA_COPYING first (at least
    // that's a working hypothesis of an explanation). In an attempt to remedy this, we have
    // increased the number of rows inserted per iteration from 20 to 200.
    ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(follower_index, tablet_id,
                                                   tablet::TABLET_DATA_COPYING, timeout));

    // Tombstone the leader.
    LOG(INFO) << "Tombstoning leader tablet " << tablet_id << " on TS " << leader_ts->uuid();
    ASSERT_OK(itest::DeleteTablet(leader_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
                                  timeout));

    // Quiesce and rebuild to full strength. This involves electing a new
    // leader from the remaining three, which requires a unanimous vote, and
    // that leader then remotely bootstrapping the old leader.
    workload.StopAndJoin();
    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
  }

  ClusterVerifier cluster_verifier(cluster_.get());
  ASSERT_NO_FATALS(cluster_verifier.CheckCluster());
  ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
                                                  workload.rows_inserted()));
}

1078 | | namespace { |
1079 | 0 | int64_t CountUpdateConsensusCalls(ExternalTabletServer* ets, const string& tablet_id) { |
1080 | 0 | return CHECK_RESULT(ets->GetInt64Metric( |
1081 | 0 | &METRIC_ENTITY_server, |
1082 | 0 | "yb.tabletserver", |
1083 | 0 | &METRIC_handler_latency_yb_consensus_ConsensusService_UpdateConsensus, |
1084 | 0 | "total_count")); |
1085 | 0 | } |
1086 | 0 | int64_t CountLogMessages(ExternalTabletServer* ets) { |
1087 | 0 | int64_t total = 0; |
1088 | |
1089 | 0 | total += CHECK_RESULT(ets->GetInt64Metric( |
1090 | 0 | &METRIC_ENTITY_server, |
1091 | 0 | "yb.tabletserver", |
1092 | 0 | &METRIC_glog_info_messages, |
1093 | 0 | "value")); |
1094 | |
1095 | 0 | total += CHECK_RESULT(ets->GetInt64Metric( |
1096 | 0 | &METRIC_ENTITY_server, |
1097 | 0 | "yb.tabletserver", |
1098 | 0 | &METRIC_glog_warning_messages, |
1099 | 0 | "value")); |
1100 | |
1101 | 0 | total += CHECK_RESULT(ets->GetInt64Metric( |
1102 | 0 | &METRIC_ENTITY_server, |
1103 | 0 | "yb.tabletserver", |
1104 | 0 | &METRIC_glog_error_messages, |
1105 | 0 | "value")); |
1106 | |
1107 | 0 | return total; |
1108 | 0 | } |
1109 | | } // anonymous namespace |
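
The three glog counters above are fetched with identical boilerplate, so the same sum can be written table-driven. A sketch, assuming the glog metric prototypes all convert to const MetricPrototype* (as the GetInt64Metric calls above suggest):

    // Behaviorally equivalent to CountLogMessages above.
    int64_t CountLogMessagesCompact(ExternalTabletServer* ets) {
      static const MetricPrototype* const kGlogMetrics[] = {
          &METRIC_glog_info_messages,
          &METRIC_glog_warning_messages,
          &METRIC_glog_error_messages,
      };
      int64_t total = 0;
      for (const auto* metric : kGlogMetrics) {
        total += CHECK_RESULT(ets->GetInt64Metric(
            &METRIC_ENTITY_server, "yb.tabletserver", metric, "value"));
      }
      return total;
    }
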
1110 | | |
1111 | | // Test that if remote bootstrap is disabled by a flag, we don't get into |
1112 | | // tight loops after a tablet is deleted. This is a regression test for a situation |
1113 | | // similar to the bug described in KUDU-821: we were previously handling a missing |
1114 | | // tablet within consensus in such a way that we'd immediately send another RPC. |
1115 | | void RemoteBootstrapITest::DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted( |
1116 | 1 | YBTableType table_type) { |
1117 | | |
1118 | 1 | MonoDelta timeout = MonoDelta::FromSeconds(10); |
1119 | 1 | std::vector<string> ts_flags = { |
1120 | 1 | "--enable_leader_failure_detection=false"s, |
1121 | 1 | "--TEST_enable_remote_bootstrap=false"s, |
1122 | 1 | "--rpc_slow_query_threshold_ms=10000000"s, |
1123 | 1 | }; |
1124 | 1 | std::vector<std::string> master_flags = { |
1125 | 1 | "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s, |
1126 | 1 | "--use_create_table_leader_hint=false"s, |
1127 | 1 | }; |
1128 | 1 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags)); |
1129 | | |
1130 | 1 | TestWorkload workload(cluster_.get()); |
1131 | | // TODO(KUDU-1054): the client should handle retrying on different replicas |
1132 | | // if the tablet isn't found, rather than giving us this error. |
1133 | 1 | workload.set_not_found_allowed(true); |
1134 | 1 | workload.set_write_batch_size(1); |
1135 | 1 | workload.Setup(table_type); |
1136 | | |
1137 | | // Figure out the tablet id of the created tablet. |
1138 | 1 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
1139 | 1 | ExternalTabletServer* replica_ets = cluster_->tablet_server(1); |
1140 | 1 | TServerDetails* replica_ts = ts_map_[replica_ets->uuid()].get(); |
1141 | 1 | ASSERT_OK(WaitForNumTabletsOnTS(replica_ts, 1, timeout, &tablets)); |
1142 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
1143 | | |
1144 | | // Wait until all replicas are up and running. |
1145 | 0 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
1146 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(), |
1147 | 0 | tablet_id, timeout)); |
1148 | 0 | } |
1149 | | |
1150 | | // Elect a leader (TS 0) |
1151 | 0 | ExternalTabletServer* leader_ts = cluster_->tablet_server(0); |
1152 | 0 | ASSERT_OK(itest::StartElection(ts_map_[leader_ts->uuid()].get(), tablet_id, timeout)); |
1153 | | |
1154 | | // Start writing, wait for some rows to be inserted. |
1155 | 0 | workload.Start(); |
1156 | 0 | workload.WaitInserted(100); |
1157 | | |
1158 | | // Tombstone the tablet on one of the servers (TS 1) |
1159 | 0 | ASSERT_OK(itest::DeleteTablet(replica_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, |
1160 | 0 | timeout)); |
1161 | | |
1162 | | // Ensure that, if we sleep for a second while still doing writes to the leader: |
1163 | | // a) we don't spew logs on the leader side |
1164 | | // b) we don't get hit with a lot of UpdateConsensus calls on the replica. |
1165 | 0 | int64_t num_update_rpcs_initial = CountUpdateConsensusCalls(replica_ets, tablet_id); |
1166 | 0 | int64_t num_logs_initial = CountLogMessages(leader_ts); |
1167 | |
1168 | 0 | SleepFor(MonoDelta::FromSeconds(1)); |
1169 | 0 | int64_t num_update_rpcs_after_sleep = CountUpdateConsensusCalls(replica_ets, tablet_id); |
1170 | 0 | int64_t num_logs_after_sleep = CountLogMessages(leader_ts); |
1171 | | |
1172 | | // Calculate the per-second rate of RPCs and log messages (the sleep above was 1 sec). |
1173 | 0 | int64_t update_rpcs_per_second = num_update_rpcs_after_sleep - num_update_rpcs_initial; |
1174 | 0 | EXPECT_LT(update_rpcs_per_second, 20); |
1175 | 0 | int64_t num_logs_per_second = num_logs_after_sleep - num_logs_initial; |
1176 | 0 | EXPECT_LT(num_logs_per_second, 20); |
1177 | 0 | } |
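
The sample/sleep/sample pattern above generalizes into a small helper. A sketch (MeasureRatePerSecond is illustrative, not part of the test; with a one-second window the delta is already a per-second rate, matching the inline logic):

    // Samples a monotonically increasing counter over a window and returns the
    // per-second delta. Assumes <functional>.
    int64_t MeasureRatePerSecond(const std::function<int64_t()>& sample,
                                 int window_secs = 1) {
      const int64_t before = sample();
      SleepFor(MonoDelta::FromSeconds(window_secs));
      return (sample() - before) / window_secs;
    }

    // Possible usage mirroring the assertions above:
    //   EXPECT_LT(MeasureRatePerSecond(
    //       [&] { return CountUpdateConsensusCalls(replica_ets, tablet_id); }), 20);
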
1178 | | |
1179 | 1 | void RemoteBootstrapITest::LeaderCrashesWhileFetchingData(YBTableType table_type) { |
1180 | 1 | crash_test_timeout_ = MonoDelta::FromSeconds(40); |
1181 | 1 | CrashTestSetUp(table_type); |
1182 | | |
1183 | | // Cause the leader to crash when a follower tries to fetch data from it. |
1184 | 1 | const string& fault_flag = "TEST_fault_crash_on_handle_rb_fetch_data"; |
1185 | 1 | ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag, |
1186 | 1 | "1.0")); |
1187 | | |
1188 | | // Add our TS 0 to the config and wait for the leader to crash. |
1189 | 0 | ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart()); |
1190 | 0 | TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get(); |
1191 | |
1192 | 0 | ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts, |
1193 | 0 | PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_, |
1194 | 0 | NULL /* error code */, |
1195 | 0 | true /* retry */)); |
1196 | |
1197 | 0 | ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_)); |
1198 | |
1199 | 0 | CrashTestVerify(); |
1200 | 0 | } |
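
The TEST_fault_crash_* flags used throughout these crash tests follow a common fault-injection pattern: at the instrumented point the server aborts with the probability carried by the flag, so setting it to "1.0" makes the crash deterministic. A self-contained sketch of that pattern (illustrative names only, not YB's actual implementation):

    #include <cstdlib>
    #include <random>

    // Aborts the process with the given probability; mirrors the spirit of the
    // crash points these TEST_ flags control.
    void MaybeFault(double fraction) {
      static std::mt19937 rng{std::random_device{}()};
      std::uniform_real_distribution<double> dist(0.0, 1.0);
      if (dist(rng) < fraction) {
        std::abort();  // simulate the injected process crash
      }
    }
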
1201 | | |
1202 | 1 | void RemoteBootstrapITest::LeaderCrashesBeforeChangeRole(YBTableType table_type) { |
1203 | | // Make the tablet server sleep in LogAndTombstone after it has called DeleteTabletData so we can |
1204 | | // verify that the tablet has been tombstoned (by calling WaitForTabletDataStateOnTS). |
1205 | 1 | crash_test_tserver_flags_.push_back("--TEST_sleep_after_tombstoning_tablet_secs=5"); |
1206 | 1 | crash_test_timeout_ = MonoDelta::FromSeconds(40); |
1207 | 1 | CrashTestSetUp(table_type); |
1208 | | |
1209 | | // Cause the leader to crash when the follower ends the remote bootstrap session and just before |
1210 | | // the leader is about to change the role of the follower. |
1211 | 1 | const string& fault_flag = "TEST_fault_crash_leader_before_changing_role"; |
1212 | 1 | ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag, |
1213 | 1 | "1.0")); |
1214 | | |
1215 | | // Add our TS 0 to the config and wait for the leader to crash. |
1216 | 0 | ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart()); |
1217 | 0 | TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get(); |
1218 | 0 | ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts, |
1219 | 0 | PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_)); |
1220 | 0 | ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_)); |
1221 | 0 | CrashTestVerify(); |
1222 | 0 | } |
1223 | | |
1224 | 1 | void RemoteBootstrapITest::LeaderCrashesAfterChangeRole(YBTableType table_type) { |
1225 | | // Make the tablet server sleep in LogAndTombstone after it has called DeleteTabletData so we can |
1226 | | // verify that the tablet has been tombstoned (by calling WaitForTabletDataStateOnTS). |
1227 | 1 | crash_test_tserver_flags_.push_back("--TEST_sleep_after_tombstoning_tablet_secs=5"); |
1228 | 1 | crash_test_timeout_ = MonoDelta::FromSeconds(40); |
1229 | 1 | CrashTestSetUp(table_type); |
1230 | | |
1231 | | // Cause the leader to crash after it has successfully sent a ChangeConfig CHANGE_ROLE request and |
1232 | | // before it responds to the EndRemoteBootstrapSession request. |
1233 | 1 | const string& fault_flag = "TEST_fault_crash_leader_after_changing_role"; |
1234 | 1 | ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_leader_index_), fault_flag, |
1235 | 1 | "1.0")); |
1236 | | |
1237 | | // Add our TS 0 to the config and wait for the leader to crash. |
1238 | 0 | ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart()); |
1239 | 0 | TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get(); |
1240 | 0 | ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts, |
1241 | 0 | PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_)); |
1242 | 0 | ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_leader_index_, kWaitForCrashTimeout_)); |
1243 | |
1244 | 0 | CrashTestVerify(); |
1245 | 0 | } |
1246 | | |
1247 | 1 | void RemoteBootstrapITest::ClientCrashesBeforeChangeRole(YBTableType table_type) { |
1248 | 1 | crash_test_timeout_ = MonoDelta::FromSeconds(40); |
1249 | 1 | crash_test_tserver_flags_.push_back("--TEST_return_error_on_change_config=0.60"); |
1250 | 1 | CrashTestSetUp(table_type); |
1251 | | |
1252 | | // Add our TS 0 to the config and wait for it to crash. |
1253 | 1 | ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart()); |
1254 | | // Cause the newly added tserver to crash after the transfer of files for remote bootstrap has |
1255 | | // completed but before ending the session with the leader to avoid triggering a ChangeConfig |
1256 | | // in the leader. |
1257 | 0 | const string& fault_flag = "TEST_fault_crash_bootstrap_client_before_changing_role"; |
1258 | 0 | ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(crash_test_tserver_index_), fault_flag, |
1259 | 0 | "1.0")); |
1260 | |
1261 | 0 | TServerDetails* ts = ts_map_[cluster_->tablet_server(crash_test_tserver_index_)->uuid()].get(); |
1262 | 0 | ASSERT_OK(itest::AddServer(crash_test_leader_ts_, crash_test_tablet_id_, ts, |
1263 | 0 | PeerMemberType::PRE_VOTER, boost::none, crash_test_timeout_)); |
1264 | |
1265 | 0 | ASSERT_OK(cluster_->WaitForTSToCrash(crash_test_tserver_index_, kWaitForCrashTimeout_)); |
1266 | |
1267 | 0 | LOG(INFO) << "Restarting TS " << cluster_->tablet_server(crash_test_tserver_index_)->uuid(); |
1268 | 0 | cluster_->tablet_server(crash_test_tserver_index_)->Shutdown(); |
1269 | 0 | ASSERT_OK(cluster_->tablet_server(crash_test_tserver_index_)->Restart()); |
1270 | |
1271 | 0 | ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(crash_test_tserver_index_, crash_test_tablet_id_, |
1272 | 0 | TABLET_DATA_READY, crash_test_timeout_)); |
1273 | |
1274 | 0 | ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(5, crash_test_leader_ts_, crash_test_tablet_id_, |
1275 | 0 | crash_test_timeout_)); |
1276 | |
1277 | 0 | ClusterVerifier cluster_verifier(cluster_.get()); |
1278 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckCluster()); |
1279 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(crash_test_workload_->table_name(), |
1280 | 0 | ClusterVerifier::AT_LEAST, |
1281 | 0 | crash_test_workload_->rows_inserted())); |
1282 | | |
1283 | | |
1284 | | // We know that the remote bootstrap checkpoint on the leader hasn't been cleared because it |
1285 | | // hasn't restarted. So we don't want to check that it's empty. |
1286 | 0 | check_checkpoints_cleared_ = false; |
1287 | 0 | } |
1288 | | |
1289 | 1 | TEST_F(RemoteBootstrapITest, TestVeryLongRemoteBootstrap) { |
1290 | 1 | vector<string> ts_flags, master_flags; |
1291 | | |
1292 | | // Make everything happen 100x faster: |
1293 | | // - follower_unavailable_considered_failed_sec from 300 to 3 secs |
1294 | | // - raft_heartbeat_interval_ms from 500 to 5 ms |
1295 | | // - consensus_rpc_timeout_ms from 3000 to 30 ms |
1296 | | |
1297 | 1 | ts_flags.push_back("--follower_unavailable_considered_failed_sec=3"); |
1298 | 1 | ts_flags.push_back("--raft_heartbeat_interval_ms=5"); |
1299 | 1 | ts_flags.push_back("--consensus_rpc_timeout_ms=30"); |
1300 | | |
1301 | | // Increase the number of missed heartbeats used to detect leader failure since in slow testing |
1302 | | // instances it is very easy to miss the default (6) heartbeats since they are being sent very |
1303 | | // fast (5ms). |
1304 | 1 | ts_flags.push_back("--leader_failure_max_missed_heartbeat_periods=40.0"); |
1305 | | |
1306 | | // Make the remote bootstrap take longer than follower_unavailable_considered_failed_sec seconds |
1307 | | // so the peer gets removed from the config while it is being remote bootstrapped. |
1308 | 1 | ts_flags.push_back("--TEST_simulate_long_remote_bootstrap_sec=5"); |
1309 | | |
1310 | 1 | master_flags.push_back("--enable_load_balancing=false"); |
1311 | | |
1312 | 1 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, 4)); |
1313 | | |
1314 | | // We'll do a config change to remote bootstrap a replica here later. For now, shut it down. |
1315 | 1 | auto constexpr kTsIndex = 0; |
1316 | 1 | LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(kTsIndex)->uuid(); |
1317 | 1 | cluster_->tablet_server(kTsIndex)->Shutdown(); |
1318 | 1 | auto new_ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()].get(); |
1319 | | |
1320 | | // Bounce the Master so it gets new tablet reports and doesn't try to assign a replica to the |
1321 | | // dead TS. |
1322 | 1 | const auto timeout = MonoDelta::FromSeconds(40); |
1323 | 1 | cluster_->master()->Shutdown(); |
1324 | 1 | LOG(INFO) << "Restarting master " << cluster_->master()->uuid(); |
1325 | 1 | ASSERT_OK(cluster_->master()->Restart()); |
1326 | 1 | ASSERT_OK(cluster_->WaitForTabletServerCount(3, timeout)); |
1327 | | |
1328 | | // Populate a tablet with some data. |
1329 | 1 | LOG(INFO) << "Starting workload"; |
1330 | 1 | TestWorkload workload(cluster_.get()); |
1331 | 1 | workload.set_sequential_write(true); |
1332 | 1 | workload.Setup(YBTableType::YQL_TABLE_TYPE); |
1333 | 1 | workload.Start(); |
1334 | 1 | workload.WaitInserted(10); |
1335 | 1 | LOG(INFO) << "Stopping workload"; |
1336 | 1 | workload.StopAndJoin(); |
1337 | | |
1338 | | // Figure out the tablet id of the created tablet. |
1339 | 1 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
1340 | 1 | TServerDetails* ts = ts_map_[cluster_->tablet_server(1)->uuid()].get(); |
1341 | 1 | ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets)); |
1342 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
1343 | |
1344 | 0 | TServerDetails* leader_ts; |
1345 | | // Find out who's leader. |
1346 | 0 | ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts)); |
1347 | | |
1348 | | // Add back TS0. |
1349 | 0 | ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); |
1350 | 0 | LOG(INFO) << "Adding tserver with uuid " << new_ts->uuid(); |
1351 | 0 | ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_ts, PeerMemberType::PRE_VOTER, boost::none, |
1352 | 0 | timeout)); |
1353 | | // After adding new_ts, the leader will detect that TS0 needs to be remote bootstrapped. Verify |
1354 | | // that this process completes successfully. |
1355 | 0 | ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY)); |
1356 | 0 | LOG(INFO) << "Tablet " << tablet_id << " in state TABLET_DATA_READY in tablet server " |
1357 | 0 | << new_ts->uuid(); |
1358 | |
1359 | 0 | ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(4, leader_ts, tablet_id, timeout)); |
1360 | 0 | LOG(INFO) << "Number of voters for tablet " << tablet_id << " is 4"; |
1361 | | |
1362 | | // Ensure all the servers agree before we proceed. |
1363 | 0 | ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); |
1364 | |
1365 | 0 | ClusterVerifier cluster_verifier(cluster_.get()); |
1366 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckCluster()); |
1367 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), |
1368 | 0 | ClusterVerifier::AT_LEAST, |
1369 | 0 | workload.rows_inserted())); |
1370 | 0 | } |
1371 | | |
1372 | 1 | TEST_F(RemoteBootstrapITest, TestFailedTabletIsRemoteBootstrapped) { |
1373 | 1 | std::vector<std::string> ts_flags = { |
1374 | 1 | "--follower_unavailable_considered_failed_sec=30", |
1375 | 1 | "--raft_heartbeat_interval_ms=50", |
1376 | 1 | "--consensus_rpc_timeout_ms=300", |
1377 | 1 | "--TEST_delay_removing_peer_with_failed_tablet_secs=10", |
1378 | 1 | "--memstore_size_mb=1", |
1379 | | // Increase the number of missed heartbeats used to detect leader failure since in slow |
1380 | | // testing instances it is very easy to miss the default (6) heartbeats since they are being |
1381 | | // sent very fast (50ms). |
1382 | 1 | "--leader_failure_max_missed_heartbeat_periods=40.0" |
1383 | 1 | }; |
1384 | | |
1385 | 1 | std::vector<std::string> master_flags = {"--enable_load_balancing=true"}; |
1386 | | |
1387 | 1 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, 3)); |
1388 | | |
1389 | 1 | const auto kTimeout = 40s; |
1390 | 1 | ASSERT_OK(cluster_->WaitForTabletServerCount(3, kTimeout)); |
1391 | | |
1392 | | // Populate a tablet with some data. |
1393 | 1 | LOG(INFO) << "Starting workload"; |
1394 | 1 | TestWorkload workload(cluster_.get()); |
1395 | 1 | workload.set_sequential_write(true); |
1396 | 1 | workload.Setup(YBTableType::YQL_TABLE_TYPE); |
1397 | 1 | workload.set_payload_bytes(1024); |
1398 | 1 | workload.Start(); |
1399 | 1 | workload.WaitInserted(5000); |
1400 | 1 | LOG(INFO) << "Stopping workload"; |
1401 | 1 | workload.StopAndJoin(); |
1402 | | |
1403 | | // Figure out the tablet id of the created tablet. |
1404 | 1 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
1405 | 1 | TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()].get(); |
1406 | 1 | ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &tablets)); |
1407 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
1408 | |
1409 | 0 | TServerDetails* leader_ts; |
1410 | | // Find out who's leader. |
1411 | 0 | ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts)); |
1412 | |
1413 | 0 | TServerDetails* non_leader_ts = nullptr; |
1414 | 0 | int non_leader_idx = -1; |
1415 | | // Find the first non-leader TS. |
1416 | 0 | for (int i = 0; i < 3; i++) { |
1417 | 0 | if (cluster_->tablet_server(i)->uuid() != leader_ts->uuid()) { |
1418 | 0 | non_leader_ts = ts_map_[cluster_->tablet_server(i)->uuid()].get(); |
1419 | 0 | non_leader_idx = i; |
1420 | 0 | break; |
1421 | 0 | } |
1422 | 0 | } |
1423 | |
1424 | 0 | ASSERT_NE(non_leader_ts, nullptr); |
1425 | 0 | ASSERT_NE(non_leader_idx, -1); |
1426 | |
1427 | 0 | ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::RUNNING, kTimeout)); |
1428 | |
1429 | 0 | auto* env = Env::Default(); |
1430 | 0 | const string data_dir = cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->GetDataDirs()[0]; |
1431 | 0 | auto meta_dir = FsManager::GetRaftGroupMetadataDir(data_dir); |
1432 | 0 | auto metadata_path = JoinPathSegments(meta_dir, tablet_id); |
1433 | 0 | tablet::RaftGroupReplicaSuperBlockPB superblock; |
1434 | 0 | ASSERT_OK(pb_util::ReadPBContainerFromPath(env, metadata_path, &superblock)); |
1435 | 0 | string tablet_data_dir = superblock.kv_store().rocksdb_dir(); |
1436 | 0 | const auto& rocksdb_files = ASSERT_RESULT(env->GetChildren(tablet_data_dir, ExcludeDots::kTrue)); |
1437 | 0 | ASSERT_GT(rocksdb_files.size(), 0); |
1438 | 0 | for (const auto& file : rocksdb_files) { |
1439 | 0 | if (file.size() > 4 && file.substr(file.size() - 4) == ".sst") { |
1440 | 0 | ASSERT_OK(env->DeleteFile(JoinPathSegments(tablet_data_dir, file))); |
1441 | 0 | LOG(INFO) << "Deleted file " << JoinPathSegments(tablet_data_dir, file); |
1442 | 0 | } |
1443 | 0 | } |
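
The size()/substr() comparison in the loop above is a plain suffix test (C++20 would allow file.ends_with(".sst")). A pre-C++20 sketch of the same check; note the original's strict file.size() > 4 additionally excludes a file named exactly ".sst":

    // Returns true when `name` ends with `suffix`.
    bool HasSuffix(const std::string& name, const std::string& suffix) {
      return name.size() >= suffix.size() &&
             name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
    }
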
1444 | | |
1445 | | // Restart the tserver so that the tablet gets marked as FAILED when it's bootstrapped. |
1446 | | // Flag TEST_delay_removing_peer_with_failed_tablet_secs will keep the tablet in the FAILED state |
1447 | | // for the specified amount of time so that we can verify that it was indeed marked as FAILED. |
1448 | 0 | cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->Shutdown(); |
1449 | 0 | ASSERT_OK(cluster_->tablet_server_by_uuid(non_leader_ts->uuid())->Restart()); |
1450 | |
1451 | 0 | ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::FAILED, kTimeout, 500ms)); |
1452 | 0 | LOG(INFO) << "Tablet " << tablet_id << " in state FAILED in tablet server " |
1453 | 0 | << non_leader_ts->uuid(); |
1454 | | |
1455 | | // After setting the tablet state to FAILED, the leader will detect that this TS needs to be |
1456 | | // removed from the config so that it can be remote bootstrapped again. Check that this process |
1457 | | // completes successfully. |
1458 | 0 | ASSERT_OK(WaitUntilTabletInState(non_leader_ts, tablet_id, tablet::RUNNING, kTimeout)); |
1459 | 0 | LOG(INFO) << "Tablet " << tablet_id << " in state RUNNING in tablet server " |
1460 | 0 | << non_leader_ts->uuid(); |
1461 | |
1462 | 0 | ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(3, leader_ts, tablet_id, kTimeout)); |
1463 | 0 | LOG(INFO) << "Number of voters for tablet " << tablet_id << " is 3"; |
1464 | | |
1465 | | // Ensure all the servers agree before we proceed. |
1466 | 0 | ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, workload.batches_completed())); |
1467 | |
1468 | 0 | ClusterVerifier cluster_verifier(cluster_.get()); |
1469 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckCluster()); |
1470 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), |
1471 | 0 | ClusterVerifier::AT_LEAST, |
1472 | 0 | workload.rows_inserted())); |
1473 | 0 | } |
1474 | | |
1475 | 1 | TEST_F(RemoteBootstrapITest, TestRejectRogueLeaderKeyValueType) { |
1476 | 1 | RejectRogueLeader(YBTableType::YQL_TABLE_TYPE); |
1477 | 1 | } |
1478 | | |
1479 | 1 | TEST_F(RemoteBootstrapITest, TestDeleteTabletDuringRemoteBootstrapKeyValueType) { |
1480 | 1 | DeleteTabletDuringRemoteBootstrap(YBTableType::YQL_TABLE_TYPE); |
1481 | 1 | } |
1482 | | |
1483 | 1 | TEST_F(RemoteBootstrapITest, TestRemoteBootstrapFollowerWithHigherTermKeyValueType) { |
1484 | 1 | RemoteBootstrapFollowerWithHigherTerm(YBTableType::YQL_TABLE_TYPE); |
1485 | 1 | } |
1486 | | |
1487 | 1 | TEST_F(RemoteBootstrapITest, TestConcurrentRemoteBootstrapsKeyValueType) { |
1488 | 1 | ConcurrentRemoteBootstraps(YBTableType::YQL_TABLE_TYPE); |
1489 | 1 | } |
1490 | | |
1491 | 1 | TEST_F(RemoteBootstrapITest, TestDeleteLeaderDuringRemoteBootstrapStressTestKeyValueType) { |
1492 | 1 | DeleteLeaderDuringRemoteBootstrapStressTest(YBTableType::YQL_TABLE_TYPE); |
1493 | 1 | } |
1494 | | |
1495 | 1 | TEST_F(RemoteBootstrapITest, TestDisableRemoteBootstrap_NoTightLoopWhenTabletDeletedKeyValueType) { |
1496 | 1 | DisableRemoteBootstrap_NoTightLoopWhenTabletDeleted(YBTableType::YQL_TABLE_TYPE); |
1497 | 1 | } |
1498 | | |
1499 | 1 | TEST_F(RemoteBootstrapITest, TestLeaderCrashesWhileFetchingDataKeyValueTableType) { |
1500 | 1 | RemoteBootstrapITest::LeaderCrashesWhileFetchingData(YBTableType::YQL_TABLE_TYPE); |
1501 | 1 | } |
1502 | | |
1503 | 1 | TEST_F(RemoteBootstrapITest, TestLeaderCrashesBeforeChangeRoleKeyValueTableType) { |
1504 | 1 | RemoteBootstrapITest::LeaderCrashesBeforeChangeRole(YBTableType::YQL_TABLE_TYPE); |
1505 | 1 | } |
1506 | | |
1507 | 1 | TEST_F(RemoteBootstrapITest, TestLeaderCrashesAfterChangeRoleKeyValueTableType) { |
1508 | 1 | RemoteBootstrapITest::LeaderCrashesAfterChangeRole(YBTableType::YQL_TABLE_TYPE); |
1509 | 1 | } |
1510 | | |
1511 | 1 | TEST_F(RemoteBootstrapITest, TestClientCrashesBeforeChangeRoleKeyValueTableType) { |
1512 | 1 | RemoteBootstrapITest::ClientCrashesBeforeChangeRole(YBTableType::YQL_TABLE_TYPE); |
1513 | 1 | } |
1514 | | |
1515 | | } // namespace yb |