/Users/deen/code/yugabyte-db/src/yb/integration-tests/tablet_replacement-itest.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include <functional> |
34 | | #include <memory> |
35 | | #include <string> |
36 | | #include <unordered_map> |
37 | | |
38 | | #include <boost/optional.hpp> |
39 | | #include <gtest/gtest.h> |
40 | | |
41 | | #include "yb/common/wire_protocol-test-util.h" |
42 | | #include "yb/common/wire_protocol.h" |
43 | | |
44 | | #include "yb/gutil/strings/substitute.h" |
45 | | |
46 | | #include "yb/integration-tests/cluster_verifier.h" |
47 | | #include "yb/integration-tests/external_mini_cluster-itest-base.h" |
48 | | #include "yb/integration-tests/test_workload.h" |
49 | | |
50 | | #include "yb/rpc/rpc_controller.h" |
51 | | |
52 | | #include "yb/tserver/tserver_service.proxy.h" |
53 | | |
54 | | #include "yb/util/countdown_latch.h" |
55 | | |
56 | | using yb::consensus::RaftPeerPB; |
57 | | using yb::consensus::PeerMemberType; |
58 | | using yb::itest::TServerDetails; |
59 | | using yb::tablet::TABLET_DATA_READY; |
60 | | using yb::tablet::TABLET_DATA_TOMBSTONED; |
61 | | using yb::tserver::ListTabletsResponsePB; |
62 | | using std::shared_ptr; |
63 | | using std::string; |
64 | | using std::unordered_map; |
65 | | using std::vector; |
66 | | using strings::Substitute; |
67 | | |
68 | | namespace yb { |
69 | | |
70 | | class TabletReplacementITest : public ExternalMiniClusterITestBase { |
71 | | }; /* Empty fixture: adds no members of its own to ExternalMiniClusterITestBase. */ |
72 | | |
73 | | // Test that the Master will tombstone a newly-evicted replica. |
74 | | // Then, test that the Master will NOT tombstone a newly-added replica that is |
75 | | // not part of the committed config yet (only the pending config). |
76 | 1 | TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) { |
77 | 1 | MonoDelta timeout = MonoDelta::FromSeconds(30); |
78 | 1 | vector<string> ts_flags = { "--enable_leader_failure_detection=false" }; |
79 | 1 | int num_tservers = 5; |
80 | 1 | vector<string> master_flags = { |
81 | 1 | "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s, |
82 | 1 | "--replication_factor=5", |
83 | 1 | "--use_create_table_leader_hint=false"s, |
84 | 1 | }; |
85 | 1 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags, num_tservers)); |
86 | | |
87 | 1 | TestWorkload workload(cluster_.get()); |
88 | 1 | workload.Setup(); // Easy way to create a new tablet; the workload itself is never started. |
89 | | |
90 | 1 | const int kLeaderIndex = 0; |
91 | 1 | TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get(); |
92 | 1 | const int kFollowerIndex = 4; |
93 | 1 | TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get(); |
94 | | |
95 | | // Figure out the tablet id of the created tablet (exactly one is expected on the leader TS). |
96 | 1 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
97 | 1 | ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
98 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
99 | | |
100 | | // Wait until the replica on every tablet server is up and running. |
101 | 0 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
102 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(), |
103 | 0 | tablet_id, timeout)); |
104 | 0 | } |
105 | | |
106 | | // Elect a leader (TS 0) by hand; leader failure detection is turned off in ts_flags above. |
107 | 0 | ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); |
108 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
109 | 0 | ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout)); |
110 | | |
111 | | // Remove a follower (TS 4) from the config. |
112 | 0 | ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout)); |
113 | | |
114 | | // Wait for the Master to tombstone the evicted replica on TS 4. |
115 | 0 | ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED, |
116 | 0 | timeout)); |
117 |
|
118 | 0 | if (!AllowSlowTests()) { |
119 | | // The rest of this test has multi-second waits, so we do it in slow test mode. |
120 | 0 | LOG(INFO) << "Not verifying that a newly-added replica won't be tombstoned in fast-test mode"; |
121 | 0 | return; |
122 | 0 | } |
123 | | |
124 | | // Shut down a majority of followers (3 servers, TS 1-3) and then try to add the |
125 | | // follower back to the config. This will cause the config change to end up |
126 | | // in a pending state. |
127 | 0 | auto active_ts_map = CreateTabletServerMapUnowned(ts_map_); |
128 | 0 | for (int i = 1; i <= 3; i++) { |
129 | 0 | cluster_->tablet_server(i)->Shutdown(); |
130 | 0 | ASSERT_EQ(1, active_ts_map.erase(cluster_->tablet_server(i)->uuid())); |
131 | 0 | } |
132 | | // This will time out (only 5 seconds are allowed), but should take effect. |
133 | 0 | Status s = itest::AddServer(leader_ts, tablet_id, follower_ts, PeerMemberType::PRE_VOTER, |
134 | 0 | boost::none, MonoDelta::FromSeconds(5), nullptr, |
135 | 0 | false /* retry */); |
136 | 0 | ASSERT_TRUE(s.IsTimedOut()); |
137 | 0 | ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY, |
138 | 0 | timeout)); |
139 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, active_ts_map, tablet_id, 3)); |
140 | | |
141 | | // Sleep for a few more seconds and check again to ensure that the Master |
142 | | // didn't end up tombstoning the replica. |
143 | 0 | SleepFor(MonoDelta::FromSeconds(3)); |
144 | 0 | ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY)); |
145 | 0 | } |
146 | | |
147 | | // Ensure that the Master will tombstone a replica if it reports in with an old |
148 | | // config. This tests a slightly different code path in the catalog manager |
149 | | // than TestMasterTombstoneEvictedReplica does. |
150 | 1 | TEST_F(TabletReplacementITest, TestMasterTombstoneOldReplicaOnReport) { |
151 | 1 | MonoDelta timeout = MonoDelta::FromSeconds(30); |
152 | 1 | vector<string> ts_flags = { "--enable_leader_failure_detection=false" }; |
153 | 1 | vector<string> master_flags = { |
154 | 1 | "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s, |
155 | 1 | "--use_create_table_leader_hint=false"s, |
156 | 1 | }; |
157 | 1 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags)); |
158 | | |
159 | 1 | TestWorkload workload(cluster_.get()); |
160 | 1 | workload.Setup(); // Easy way to create a new tablet; the workload itself is never started. |
161 | | |
162 | 1 | const int kLeaderIndex = 0; |
163 | 1 | TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get(); |
164 | 1 | const int kFollowerIndex = 2; |
165 | 1 | TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get(); |
166 | | |
167 | | // Figure out the tablet id of the created tablet (exactly one is expected on the leader TS). |
168 | 1 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
169 | 1 | ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
170 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
171 | | |
172 | | // Wait until the replica on every tablet server is up and running. |
173 | 0 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
174 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(), |
175 | 0 | tablet_id, timeout)); |
176 | 0 | } |
177 | | |
178 | | // Elect a leader (TS 0) by hand; leader failure detection is turned off in ts_flags above. |
179 | 0 | ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); |
180 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
181 | 0 | ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout)); |
182 | | |
183 | | // Shut down the follower to be removed (TS 2), then remove it from the config. |
184 | | // We will wait for the Master to be notified of the config change, then shut |
185 | | // down the rest of the cluster and bring the follower back up. The follower |
186 | | // will heartbeat to the Master and then be tombstoned. |
187 | 0 | cluster_->tablet_server(kFollowerIndex)->Shutdown(); |
188 | | |
189 | | // Remove the follower from the config and wait for the Master to notice the |
190 | | // config change (the Master should then report 2 voters for the tablet). |
191 | 0 | ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout)); |
192 | 0 | ASSERT_OK(itest::WaitForNumVotersInConfigOnMaster(cluster_.get(), tablet_id, 2, timeout)); |
193 | | |
194 | | // Shut down the remaining tablet servers (TS 0 and TS 1) and restart the dead one (TS 2). |
195 | 0 | cluster_->tablet_server(0)->Shutdown(); |
196 | 0 | cluster_->tablet_server(1)->Shutdown(); |
197 | 0 | ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); |
198 | | |
199 | | // Wait for the Master to tombstone the revived follower (TS 2). |
200 | 0 | ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED, |
201 | 0 | timeout)); |
202 | 0 | } |
203 | | |
204 | | // Test that unreachable followers are evicted and replaced. |
205 | 1 | TEST_F(TabletReplacementITest, TestEvictAndReplaceDeadFollower) { |
206 | 1 | if (!AllowSlowTests()) { |
207 | 1 | LOG(INFO) << "Skipping test in fast-test mode."; |
208 | 1 | return; |
209 | 1 | } |
210 | | |
211 | 0 | MonoDelta timeout = MonoDelta::FromSeconds(30); |
212 | 0 | vector<string> ts_flags = { "--enable_leader_failure_detection=false", |
213 | 0 | "--follower_unavailable_considered_failed_sec=5" }; |
214 | 0 | vector<string> master_flags = { |
215 | 0 | "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s, |
216 | 0 | "--use_create_table_leader_hint=false"s, |
217 | 0 | }; |
218 | 0 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags)); |
219 |
|
220 | 0 | TestWorkload workload(cluster_.get()); |
221 | 0 | workload.Setup(); // Easy way to create a new tablet; the workload itself is never started. |
222 |
|
223 | 0 | const int kLeaderIndex = 0; |
224 | 0 | TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get(); |
225 | 0 | const int kFollowerIndex = 2; |
226 | | |
227 | | // Figure out the tablet id of the created tablet (exactly one is expected on the leader TS). |
228 | 0 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
229 | 0 | ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
230 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
231 | | |
232 | | // Wait until the replica on every tablet server is up and running. |
233 | 0 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
234 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(), |
235 | 0 | tablet_id, timeout)); |
236 | 0 | } |
237 | | |
238 | | // Elect a leader (TS 0) by hand; leader failure detection is turned off in ts_flags above. |
239 | 0 | ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); |
240 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
241 | | |
242 | | // Shut down the follower (TS 2). It should be evicted without any manual RemoveServer call. |
243 | 0 | cluster_->tablet_server(kFollowerIndex)->Shutdown(); |
244 | | |
245 | | // With a RemoveServer and AddServer, the opid_index of the committed config will be 3. |
246 | 0 | ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(3, |
247 | 0 | leader_ts, |
248 | 0 | tablet_id, |
249 | 0 | timeout, |
250 | 0 | itest::CommittedEntryType::CONFIG)); |
251 | 0 | ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); |
252 | 0 | } |
253 | | |
254 | | // Regression test for KUDU-1233. This test creates a situation in which tablet |
255 | | // bootstrap will attempt to replay committed (and applied) config change |
256 | | // operations. This is achieved by delaying application of a write at the |
257 | | // tablet level that precedes the config change operations in the WAL, then |
258 | | // initiating a remote bootstrap to a follower. The follower will not have the |
259 | | // COMMIT for the write operation, so will ignore COMMIT messages for the |
260 | | // applied config change operations. At startup time, the newly |
261 | | // remotely-bootstrapped tablet should detect that these config change |
262 | | // operations have already been applied and skip them. |
263 | 1 | TEST_F(TabletReplacementITest, TestRemoteBoostrapWithPendingConfigChangeCommits) { |
264 | 1 | if (!AllowSlowTests()) { |
265 | 1 | LOG(INFO) << "Skipping test in fast-test mode."; |
266 | 1 | return; |
267 | 1 | } |
268 | | |
269 | 0 | MonoDelta timeout = MonoDelta::FromSeconds(30); |
270 | 0 | vector<string> ts_flags = { |
271 | 0 | "--enable_leader_failure_detection=false"s, |
272 | 0 | }; |
273 | | // We will manage doing the AddServer() manually, in order to make this test |
274 | | // more deterministic. |
275 | 0 | vector<string> master_flags = { |
276 | 0 | "--master_tombstone_evicted_tablet_replicas=false"s, |
277 | 0 | "--catalog_manager_wait_for_new_tablets_to_elect_leader=false"s, |
278 | 0 | "--use_create_table_leader_hint=false"s, |
279 | 0 | }; |
280 | 0 | ASSERT_NO_FATALS(StartCluster(ts_flags, master_flags)); |
281 |
|
282 | 0 | TestWorkload workload(cluster_.get()); |
283 | 0 | workload.Setup(); // Convenient way to create a table; the workload itself is never started. |
284 |
|
285 | 0 | const int kLeaderIndex = 0; |
286 | 0 | TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()].get(); |
287 | 0 | const int kFollowerIndex = 2; |
288 | 0 | TServerDetails* ts_to_remove = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()].get(); |
289 | | |
290 | | // Wait for tablet creation and then identify the tablet id (one tablet expected on the leader). |
291 | 0 | vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
292 | 0 | ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
293 | 0 | string tablet_id = tablets[0].tablet_status().tablet_id(); |
294 | | |
295 | | // Wait until the replica on every tablet server is up and running. |
296 | 0 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
297 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()].get(), |
298 | 0 | tablet_id, timeout)); |
299 | 0 | } |
300 | | |
301 | | // Elect a leader (TS 0) by hand; leader failure detection is turned off in ts_flags above. |
302 | 0 | ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); |
303 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
304 | | |
305 | | // Write a single row via the leader. |
306 | 0 | ASSERT_OK(WriteSimpleTestRow(leader_ts, tablet_id, 0, 0, "", timeout)); |
307 | | |
308 | | // Delay tablet applies in order to delay COMMIT messages to trigger KUDU-1233. |
309 | | // The flag is set to 5000 on the leader only. Then insert another row. |
310 | 0 | ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(leader_ts->uuid()), |
311 | 0 | "TEST_tablet_inject_latency_on_apply_write_txn_ms", "5000")); |
312 | | |
313 | | // Kick off an async insert, delayed for 5 seconds: normally enough time to evict a replica, |
314 | | // tombstone it, add it back, and remotely bootstrap it when the log is only a few entries. |
315 | | // NOTE(review): a failing ASSERT below would return before latch.Wait() at the end - confirm. |
316 | 0 | tserver::WriteRequestPB req; |
317 | 0 | tserver::WriteResponsePB resp; |
318 | 0 | CountDownLatch latch(1); |
319 | 0 | rpc::RpcController rpc; |
320 | 0 | rpc.set_timeout(timeout); |
321 | 0 | req.set_tablet_id(tablet_id); |
322 | 0 | AddTestRowInsert(1, 1, "", &req); |
323 | 0 | leader_ts->tserver_proxy->WriteAsync(req, &resp, &rpc, [&latch]() { latch.CountDown(); }); |
324 | | |
325 | | // Wait for the replicate to show up (this doesn't wait for COMMIT messages). |
326 | 0 | ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 3)); |
327 | 0 | ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(3, leader_ts, tablet_id, timeout)); |
328 | | |
329 | | // Manually evict the server from the cluster, tombstone the replica, then |
330 | | // add the replica back to the cluster. Without the fix for KUDU-1233, this |
331 | | // will cause the replica to fail to start up. |
332 | 0 | ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, ts_to_remove, boost::none, timeout)); |
333 | 0 | ASSERT_OK(itest::DeleteTablet(ts_to_remove, tablet_id, TABLET_DATA_TOMBSTONED, |
334 | 0 | boost::none, timeout)); |
335 | 0 | ASSERT_OK(itest::AddServer(leader_ts, tablet_id, ts_to_remove, PeerMemberType::PRE_VOTER, |
336 | 0 | boost::none, timeout)); |
337 | 0 | ASSERT_OK(itest::WaitUntilTabletRunning(ts_to_remove, tablet_id, timeout)); |
338 |
|
339 | 0 | ClusterVerifier cluster_verifier(cluster_.get()); |
340 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckCluster()); |
341 | 0 | ASSERT_NO_FATALS(cluster_verifier.CheckRowCount(workload.table_name(), |
342 | 0 | ClusterVerifier::EXACTLY, 2)); |
343 |
|
344 | 0 | latch.Wait(); // Avoid use-after-free on the response from the delayed RPC callback. |
345 | 0 | } |
346 | | |
347 | | } // namespace yb |