/Users/deen/code/yugabyte-db/src/yb/integration-tests/tablet-split-itest.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include <chrono> |
15 | | #include <thread> |
16 | | |
17 | | #include <gtest/gtest.h> |
18 | | |
19 | | #include "yb/client/table.h" |
20 | | |
21 | | #include "yb/common/entity_ids_types.h" |
22 | | #include "yb/common/ql_expr.h" |
23 | | #include "yb/common/ql_value.h" |
24 | | #include "yb/common/schema.h" |
25 | | #include "yb/common/wire_protocol.h" |
26 | | |
27 | | #include "yb/consensus/consensus.h" |
28 | | #include "yb/consensus/consensus.pb.h" |
29 | | #include "yb/consensus/consensus.proxy.h" |
30 | | #include "yb/consensus/consensus_util.h" |
31 | | |
32 | | #include "yb/docdb/doc_key.h" |
33 | | |
34 | | #include "yb/fs/fs_manager.h" |
35 | | |
36 | | #include "yb/gutil/dynamic_annotations.h" |
37 | | #include "yb/gutil/strings/join.h" |
38 | | #include "yb/gutil/strings/util.h" |
39 | | |
40 | | #include "yb/integration-tests/cluster_itest_util.h" |
41 | | #include "yb/integration-tests/redis_table_test_base.h" |
42 | | #include "yb/integration-tests/tablet-split-itest-base.h" |
43 | | #include "yb/integration-tests/test_workload.h" |
44 | | |
45 | | #include "yb/master/catalog_entity_info.h" |
46 | | #include "yb/master/catalog_manager_if.h" |
47 | | #include "yb/master/master_client.pb.h" |
48 | | #include "yb/master/master_defaults.h" |
49 | | #include "yb/master/master_error.h" |
50 | | #include "yb/master/master_heartbeat.pb.h" |
51 | | |
52 | | #include "yb/rocksdb/db.h" |
53 | | |
54 | | #include "yb/rpc/messenger.h" |
55 | | #include "yb/rpc/proxy.h" |
56 | | #include "yb/rpc/rpc_controller.h" |
57 | | |
58 | | #include "yb/tablet/tablet.h" |
59 | | #include "yb/tablet/tablet_metadata.h" |
60 | | #include "yb/tablet/tablet_peer.h" |
61 | | |
62 | | #include "yb/tserver/mini_tablet_server.h" |
63 | | #include "yb/tserver/tablet_server.h" |
64 | | #include "yb/tserver/ts_tablet_manager.h" |
65 | | #include "yb/tserver/tserver_admin.pb.h" |
66 | | #include "yb/tserver/tserver_admin.proxy.h" |
67 | | #include "yb/tserver/tserver_service.pb.h" |
68 | | |
69 | | #include "yb/util/atomic.h" |
70 | | #include "yb/util/format.h" |
71 | | #include "yb/util/protobuf_util.h" |
72 | | #include "yb/util/random_util.h" |
73 | | #include "yb/util/size_literals.h" |
74 | | #include "yb/util/status.h" |
75 | | #include "yb/util/status_format.h" |
76 | | #include "yb/util/status_log.h" |
77 | | #include "yb/util/test_util.h" |
78 | | #include "yb/util/tsan_util.h" |
79 | | |
80 | | using namespace std::literals; // NOLINT |
81 | | using namespace yb::client::kv_table_test; // NOLINT |
82 | | |
83 | | DECLARE_int64(db_block_size_bytes); |
84 | | DECLARE_int64(db_write_buffer_size); |
85 | | DECLARE_bool(enable_load_balancing); |
86 | | DECLARE_int32(load_balancer_max_concurrent_adds); |
87 | | DECLARE_int32(load_balancer_max_concurrent_removals); |
88 | | DECLARE_int64(rocksdb_compact_flush_rate_limit_bytes_per_sec); |
89 | | DECLARE_int32(rocksdb_level0_file_num_compaction_trigger); |
90 | | DECLARE_bool(rocksdb_disable_compactions); |
91 | | DECLARE_bool(TEST_do_not_start_election_test_only); |
92 | | DECLARE_int32(TEST_apply_tablet_split_inject_delay_ms); |
93 | | DECLARE_int32(heartbeat_interval_ms); |
94 | | DECLARE_int32(leader_lease_duration_ms); |
95 | | DECLARE_int32(raft_heartbeat_interval_ms); |
96 | | DECLARE_double(leader_failure_max_missed_heartbeat_periods); |
97 | | DECLARE_bool(TEST_skip_deleting_split_tablets); |
98 | | DECLARE_uint64(tablet_split_limit_per_table); |
99 | | DECLARE_bool(TEST_pause_before_post_split_compaction); |
100 | | DECLARE_int32(TEST_slowdown_backfill_alter_table_rpcs_ms); |
101 | | DECLARE_int32(rocksdb_base_background_compactions); |
102 | | DECLARE_int32(rocksdb_max_background_compactions); |
103 | | DECLARE_bool(enable_automatic_tablet_splitting); |
104 | | DECLARE_int64(tablet_split_low_phase_shard_count_per_node); |
105 | | DECLARE_int64(tablet_split_high_phase_shard_count_per_node); |
106 | | DECLARE_int64(tablet_split_low_phase_size_threshold_bytes); |
107 | | DECLARE_int64(tablet_split_high_phase_size_threshold_bytes); |
108 | | DECLARE_int64(tablet_force_split_threshold_bytes); |
109 | | DECLARE_int32(tserver_heartbeat_metrics_interval_ms); |
110 | | DECLARE_bool(TEST_validate_all_tablet_candidates); |
111 | | DECLARE_uint64(outstanding_tablet_split_limit); |
112 | | DECLARE_double(TEST_fail_tablet_split_probability); |
113 | | DECLARE_bool(TEST_skip_post_split_compaction); |
114 | | DECLARE_int32(TEST_nodes_per_cloud); |
115 | | DECLARE_int32(replication_factor); |
116 | | DECLARE_int32(txn_max_apply_batch_records); |
117 | | DECLARE_int32(TEST_pause_and_skip_apply_intents_task_loop_ms); |
118 | | DECLARE_bool(TEST_pause_tserver_get_split_key); |
119 | | DECLARE_bool(TEST_reject_delete_not_serving_tablet_rpc); |
120 | | DECLARE_int32(timestamp_history_retention_interval_sec); |
121 | | DECLARE_int64(db_block_cache_size_bytes); |
122 | | |
123 | | |
124 | | namespace yb { |
125 | | class TabletSplitITestWithIsolationLevel : public TabletSplitITest, |
126 | | public testing::WithParamInterface<IsolationLevel> { |
127 | | public: |
128 | 3 | void SetUp() override { |
129 | 3 | SetIsolationLevel(GetParam()); |
130 | 3 | TabletSplitITest::SetUp(); |
131 | 3 | } |
132 | | }; |
133 | | |
134 | | // Tests splitting of a single tablet in the following steps: |
135 | | // - Create a single-tablet table and populate it with the specified number of rows. |
136 | | // - Do full scan using `select count(*)`. |
137 | | // - Send SplitTablet RPC to the tablet leader. |
138 | | // - After tablet split is completed - check that new tablets have exactly the same rows. |
139 | | // - Check that source tablet is rejecting reads and writes. |
140 | | // - Do full scan using `select count(*)`. |
141 | | // - Restart cluster. |
142 | | // - ClusterVerifier will check cluster integrity at the end of the test. |
143 | | |
144 | 0 | TEST_P(TabletSplitITestWithIsolationLevel, SplitSingleTablet) { |
145 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true; |
146 | | |
147 | | // TODO(tsplit): add delay of applying part of intents after tablet is split. |
148 | | // TODO(tsplit): test split during long-running transactions. |
149 | |
|
150 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
151 | |
|
152 | 0 | const auto source_tablet_id = ASSERT_RESULT(CreateSingleTabletAndSplit(kNumRows)); |
153 | |
|
154 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = false; |
155 | |
|
156 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(/* expected_non_split_tablets =*/ 2)); |
157 | |
|
158 | 0 | ASSERT_OK(CheckRowsCount(kNumRows)); |
159 | |
|
160 | 0 | ASSERT_OK(WriteRows(kNumRows, kNumRows + 1)); |
161 | |
|
162 | 0 | ASSERT_OK(cluster_->RestartSync()); |
163 | |
|
164 | 0 | ASSERT_OK(CheckPostSplitTabletReplicasData(kNumRows * 2)); |
165 | 0 | } |
166 | | |
167 | 0 | TEST_F(TabletSplitITest, SplitTabletIsAsync) { |
168 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
169 | |
|
170 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = true; |
171 | |
|
172 | 0 | ASSERT_OK(CreateSingleTabletAndSplit(kNumRows)); |
173 | |
|
174 | 0 | for (auto peer : ASSERT_RESULT(ListPostSplitChildrenTabletPeers())) { |
175 | 0 | EXPECT_FALSE(peer->tablet()->metadata()->has_been_fully_compacted()); |
176 | 0 | } |
177 | 0 | std::this_thread::sleep_for(1s * kTimeMultiplier); |
178 | 0 | for (auto peer : ASSERT_RESULT(ListPostSplitChildrenTabletPeers())) { |
179 | 0 | EXPECT_FALSE(peer->tablet()->metadata()->has_been_fully_compacted()); |
180 | 0 | } |
181 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = false; |
182 | 0 | ASSERT_OK(WaitForTestTablePostSplitTabletsFullyCompacted(15s * kTimeMultiplier)); |
183 | 0 | } |
184 | | |
185 | 0 | TEST_F(TabletSplitITest, ParentTabletCleanup) { |
186 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
187 | |
|
188 | 0 | ASSERT_OK(CreateSingleTabletAndSplit(kNumRows)); |
189 | | |
190 | | // This will make client first try to access deleted tablet and that should be handled correctly. |
191 | 0 | ASSERT_OK(CheckRowsCount(kNumRows)); |
192 | 0 | } |
193 | | |
194 | | class TabletSplitNoBlockCacheITest : public TabletSplitITest { |
195 | | public: |
196 | 1 | void SetUp() override { |
197 | 1 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_db_block_cache_size_bytes) = -2; |
198 | 1 | TabletSplitITest::SetUp(); |
199 | 1 | } |
200 | | }; |
201 | | |
202 | 0 | TEST_F_EX(TabletSplitITest, TestInitiatesCompactionAfterSplit, TabletSplitNoBlockCacheITest) { |
203 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true; |
204 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 5; |
205 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
206 | 0 | constexpr auto kNumPostSplitTablets = 2; |
207 | |
|
208 | 0 | CreateSingleTablet(); |
209 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
210 | |
|
211 | 0 | ASSERT_OK(SplitTabletAndValidate(split_hash_code, kNumRows)); |
212 | 0 | ASSERT_OK(LoggedWaitFor( |
213 | 0 | [this]() -> Result<bool> { |
214 | 0 | const auto count = VERIFY_RESULT(NumPostSplitTabletPeersFullyCompacted()); |
215 | 0 | return count >= kNumPostSplitTablets * ANNOTATE_UNPROTECTED_READ(FLAGS_replication_factor); |
216 | 0 | }, |
217 | 0 | 15s * kTimeMultiplier, "Waiting for post-split tablets to be fully compacted...")); |
218 | | |
219 | | // Get the sum of compaction bytes read by each child tablet replica grouped by peer uuid |
220 | 0 | auto replicas = ListTableActiveTabletPeers(cluster_.get(), ASSERT_RESULT(GetTestTableId())); |
221 | 0 | ASSERT_EQ(replicas.size(), |
222 | 0 | kNumPostSplitTablets * ANNOTATE_UNPROTECTED_READ(FLAGS_replication_factor)); |
223 | 0 | std::unordered_map<std::string, uint64_t> child_replicas_bytes_read; |
224 | 0 | for (const auto& replica : replicas) { |
225 | 0 | auto replica_bytes = replica->tablet()->regulardb_statistics()->getTickerCount( |
226 | 0 | rocksdb::Tickers::COMPACT_READ_BYTES); |
227 | 0 | ASSERT_GT(replica_bytes, 0) << "Expected replica's read bytes to be greater than zero."; |
228 | 0 | child_replicas_bytes_read[replica->permanent_uuid()] += replica_bytes; |
229 | 0 | } |
230 | | |
231 | | // Get parent tablet's bytes written and also check value is the same for all replicas |
232 | 0 | replicas = ASSERT_RESULT(ListSplitCompleteTabletPeers()); |
233 | 0 | ASSERT_EQ(replicas.size(), ANNOTATE_UNPROTECTED_READ(FLAGS_replication_factor)); |
234 | 0 | uint64_t pre_split_sst_files_size = 0; |
235 | 0 | for (const auto& replica : replicas) { |
236 | 0 | auto replica_bytes = replica->tablet()->GetCurrentVersionSstFilesSize(); |
237 | 0 | ASSERT_GT(replica_bytes, 0) << "Expected replica's SST file size to be greater than zero."; |
238 | 0 | if (pre_split_sst_files_size == 0) { |
239 | 0 | pre_split_sst_files_size = replica_bytes; |
240 | 0 | } else { |
241 | 0 | ASSERT_EQ(replica_bytes, pre_split_sst_files_size) |
242 | 0 | << "Expected the number of SST files size at each replica to be the same."; |
243 | 0 | } |
244 | 0 | } |
245 | | |
246 | | // Make sure that during child tablets compaction we don't read the same row twice, in other words |
247 | | // we don't process parent tablet rows that are not served by child tablet. A specific scaling |
248 | | // factor is used to measure relation between child replicas bytes read and parent SST files |
249 | | // due to unpredictable space overhead for reading files and SST file structure after the split. |
250 | 0 | constexpr double kScalingFactor = 1.05; |
251 | 0 | const double child_replicas_bytes_read_upper_bound = pre_split_sst_files_size * kScalingFactor; |
252 | 0 | uint64_t post_split_bytes_read = 0; |
253 | 0 | for (const auto& replica_stat : child_replicas_bytes_read) { |
254 | 0 | if (post_split_bytes_read == 0) { |
255 | 0 | post_split_bytes_read = replica_stat.second; |
256 | 0 | } else { |
257 | 0 | ASSERT_EQ(replica_stat.second, post_split_bytes_read); |
258 | 0 | } |
259 | | // There are two ways to resolve a failure at this point if it occurs. The first is to increase |
260 | | // the value of kScalingFactor but this approach is not very accurate. The second way is to |
261 | | // use rocksdb::EventListener to retrieve CompactionJobInfo.info.stats.num_input_records and |
262 | | // to measure it with the number of records in regular DB via TableProperties::num_entries, |
263 | | // see VerifyTableProperties() for an example. |
264 | 0 | ASSERT_LE(static_cast<double>(post_split_bytes_read), child_replicas_bytes_read_upper_bound); |
265 | 0 | } |
266 | 0 | } |
267 | | |
268 | | // Test for https://github.com/yugabyte/yugabyte-db/issues/8295. |
269 | | // Checks that slow post-split tablet compaction doesn't block that tablet's cleanup. |
270 | 0 | TEST_F(TabletSplitITest, PostSplitCompactionDoesntBlockTabletCleanup) { |
271 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
272 | 0 | const MonoDelta kCleanupTimeout = 15s * kTimeMultiplier; |
273 | |
|
274 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_load_balancing) = false; |
275 | | // Keep tablets without compaction after split. |
276 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = true; |
277 | |
|
278 | 0 | ASSERT_OK(CreateSingleTabletAndSplit(kNumRows)); |
279 | |
|
280 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_do_not_start_election_test_only) = true; |
281 | 0 | auto tablet_peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
282 | 0 | ASSERT_EQ(tablet_peers.size(), 2); |
283 | 0 | const auto first_child_tablet = tablet_peers[0]->shared_tablet(); |
284 | 0 | ASSERT_OK(first_child_tablet->Flush(tablet::FlushMode::kSync)); |
285 | | // Force compact on leader, so we can split first_child_tablet. |
286 | 0 | ASSERT_OK(first_child_tablet->ForceFullRocksDBCompact()); |
287 | | // Turn off split tablets cleanup in order to later turn it on during compaction of the |
288 | | // first_child_tablet to make sure manual compaction won't block tablet shutdown. |
289 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true; |
290 | 0 | ASSERT_OK(SplitTablet(ASSERT_RESULT(catalog_manager()), *first_child_tablet)); |
291 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_do_not_start_election_test_only) = false; |
292 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
293 | 0 | /* expected_non_split_tablets = */ 3, /* expected_split_tablets = */ 1)); |
294 | | |
295 | | // Simulate slow compaction, so it takes at least kCleanupTimeout * 1.5 for first child tablet |
296 | | // followers. |
297 | 0 | const auto original_compact_flush_rate_bytes_per_sec = |
298 | 0 | FLAGS_rocksdb_compact_flush_rate_limit_bytes_per_sec; |
299 | 0 | SetCompactFlushRateLimitBytesPerSec( |
300 | 0 | cluster_.get(), |
301 | 0 | first_child_tablet->GetCurrentVersionSstFilesSize() / (kCleanupTimeout.ToSeconds() * 1.5)); |
302 | | // Resume post-split compaction. |
303 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = false; |
304 | | // Turn on split tablets cleanup. |
305 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = false; |
306 | | |
307 | | // Cleanup of first_child_tablet will shutdown that tablet after deletion and we check that |
308 | | // shutdown is not stuck due to its slow post-split compaction. |
309 | 0 | const auto wait_message = |
310 | 0 | Format("Waiting for tablet $0 cleanup", first_child_tablet->tablet_id()); |
311 | 0 | LOG(INFO) << wait_message << "..."; |
312 | 0 | std::vector<Result<tablet::TabletPeerPtr>> first_child_tablet_peer_results; |
313 | 0 | const auto s = WaitFor( |
314 | 0 | [this, &first_child_tablet, &first_child_tablet_peer_results] { |
315 | 0 | first_child_tablet_peer_results.clear(); |
316 | 0 | for (auto mini_ts : cluster_->mini_tablet_servers()) { |
317 | 0 | auto tablet_peer_result = |
318 | 0 | mini_ts->server()->tablet_manager()->LookupTablet(first_child_tablet->tablet_id()); |
319 | 0 | if (tablet_peer_result.ok() || !tablet_peer_result.status().IsNotFound()) { |
320 | 0 | first_child_tablet_peer_results.push_back(tablet_peer_result); |
321 | 0 | } |
322 | 0 | } |
323 | 0 | return first_child_tablet_peer_results.empty(); |
324 | 0 | }, |
325 | 0 | kCleanupTimeout, wait_message); |
326 | 0 | for (const auto& peer_result : first_child_tablet_peer_results) { |
327 | 0 | LOG(INFO) << "Tablet peer not cleaned: " |
328 | 0 | << (peer_result.ok() ? (*peer_result)->LogPrefix() : AsString(peer_result.status())); |
329 | 0 | } |
330 | 0 | ASSERT_OK(s); |
331 | 0 | LOG(INFO) << wait_message << " - DONE"; |
332 | |
|
333 | 0 | SetCompactFlushRateLimitBytesPerSec(cluster_.get(), original_compact_flush_rate_bytes_per_sec); |
334 | 0 | } |
335 | | |
336 | 0 | TEST_F(TabletSplitITest, TestLoadBalancerAndSplit) { |
337 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
338 | | |
339 | | // To speed up load balancing (it also processes transaction status tablets). |
340 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_load_balancer_max_concurrent_adds) = 5; |
341 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_load_balancer_max_concurrent_removals) = 5; |
342 | | |
343 | | // Keep tablets without compaction after split. |
344 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = true; |
345 | |
|
346 | 0 | CreateSingleTablet(); |
347 | |
|
348 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
349 | |
|
350 | 0 | ASSERT_OK(SplitTabletAndValidate(split_hash_code, kNumRows)); |
351 | |
|
352 | 0 | auto test_table_id = ASSERT_RESULT(GetTestTableId()); |
353 | 0 | auto test_tablet_ids = ListTabletIdsForTable(cluster_.get(), test_table_id); |
354 | | |
355 | | // Verify that the heartbeat contains the should_disable_lb_move flag for all tablets of the test |
356 | | // table on each tserver after the split. |
357 | 0 | for (size_t i = 0; i != cluster_->num_tablet_servers(); ++i) { |
358 | 0 | auto* ts_manager = cluster_->mini_tablet_server(i)->server()->tablet_manager(); |
359 | |
|
360 | 0 | master::TabletReportPB report; |
361 | 0 | ts_manager->GenerateTabletReport(&report); |
362 | 0 | for (const auto& reported_tablet : report.updated_tablets()) { |
363 | 0 | if (test_tablet_ids.count(reported_tablet.tablet_id()) == 0) { |
364 | 0 | continue; |
365 | 0 | } |
366 | 0 | ASSERT_TRUE(reported_tablet.should_disable_lb_move()); |
367 | 0 | } |
368 | 0 | } |
369 | | |
370 | | // Add a new tserver to force load balancer moves. |
371 | 0 | auto new_ts = cluster_->num_tablet_servers(); |
372 | 0 | ASSERT_OK(cluster_->AddTabletServer()); |
373 | 0 | ASSERT_OK(cluster_->WaitForTabletServerCount(new_ts + 1)); |
374 | 0 | const auto new_ts_uuid = cluster_->mini_tablet_server(new_ts)->server()->permanent_uuid(); |
375 | |
|
376 | 0 | LOG(INFO) << "Added new tserver: " << new_ts_uuid; |
377 | | |
378 | | // Wait for the LB run. |
379 | 0 | const auto lb_wait_period = MonoDelta::FromMilliseconds( |
380 | 0 | FLAGS_catalog_manager_bg_task_wait_ms * 2 + FLAGS_raft_heartbeat_interval_ms * 2); |
381 | 0 | SleepFor(lb_wait_period); |
382 | | |
383 | | // Verify that no test tablet replica is on the new tserver. |
384 | 0 | for (const auto& tablet : ASSERT_RESULT(GetTabletInfosForTable(test_table_id))) { |
385 | 0 | const auto replica_map = tablet->GetReplicaLocations(); |
386 | 0 | ASSERT_TRUE(replica_map->find(new_ts_uuid) == replica_map->end()) |
387 | 0 | << "Not expected tablet " << tablet->id() |
388 | 0 | << " to be on newly added tserver: " << new_ts_uuid; |
389 | 0 | } |
390 | | |
391 | | // Verify that custom placement info is honored when tablets are split. |
392 | 0 | const auto& blacklisted_ts = *ASSERT_NOTNULL(cluster_->mini_tablet_server(1)); |
393 | 0 | const auto blacklisted_ts_uuid = blacklisted_ts.server()->permanent_uuid(); |
394 | 0 | ASSERT_OK(cluster_->AddTServerToBlacklist(blacklisted_ts)); |
395 | 0 | LOG(INFO) << "Blacklisted tserver: " << blacklisted_ts_uuid; |
396 | 0 | std::vector<TabletId> on_blacklisted_ts; |
397 | 0 | std::vector<TabletId> no_replicas_on_new_ts; |
398 | 0 | auto s = LoggedWaitFor( |
399 | 0 | [&] { |
400 | 0 | auto tablet_infos = GetTabletInfosForTable(test_table_id); |
401 | 0 | if (!tablet_infos.ok()) { |
402 | 0 | return false; |
403 | 0 | } |
404 | 0 | on_blacklisted_ts.clear(); |
405 | 0 | no_replicas_on_new_ts.clear(); |
406 | 0 | for (const auto& tablet : *tablet_infos) { |
407 | 0 | auto replica_map = tablet->GetReplicaLocations(); |
408 | 0 | if (replica_map->count(new_ts_uuid) == 0) { |
409 | 0 | no_replicas_on_new_ts.push_back(tablet->id()); |
410 | 0 | } |
411 | 0 | if (replica_map->count(blacklisted_ts_uuid) > 0) { |
412 | 0 | on_blacklisted_ts.push_back(tablet->id()); |
413 | 0 | } |
414 | 0 | } |
415 | 0 | return on_blacklisted_ts.empty() && no_replicas_on_new_ts.empty(); |
416 | 0 | }, |
417 | 0 | 60s * kTimeMultiplier, |
418 | 0 | Format( |
419 | 0 | "Wait for all test tablet replicas to be moved from tserver $0 to $1 on master", |
420 | 0 | blacklisted_ts_uuid, new_ts_uuid)); |
421 | 0 | ASSERT_TRUE(s.ok()) << Format( |
422 | 0 | "Replicas are still on blacklisted tserver $0: $1\nNo replicas for tablets on new tserver " |
423 | 0 | "$2: $3", |
424 | 0 | blacklisted_ts_uuid, on_blacklisted_ts, new_ts_uuid, no_replicas_on_new_ts); |
425 | | |
426 | 0 | ASSERT_OK(cluster_->ClearBlacklist()); |
427 | | // Wait for the LB run. |
428 | 0 | SleepFor(lb_wait_period); |
429 | | |
430 | | // Test tablets should not move until compaction. |
431 | 0 | for (const auto& tablet : ASSERT_RESULT(GetTabletInfosForTable(test_table_id))) { |
432 | 0 | const auto replica_map = tablet->GetReplicaLocations(); |
433 | 0 | ASSERT_TRUE(replica_map->find(blacklisted_ts_uuid) == replica_map->end()) |
434 | 0 | << "Not expected tablet " << tablet->id() << " to be on tserver " << blacklisted_ts_uuid |
435 | 0 | << " that moved out of blacklist before post-split compaction completed"; |
436 | 0 | } |
437 | |
|
438 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = false; |
439 | |
|
440 | 0 | ASSERT_OK(WaitForTestTablePostSplitTabletsFullyCompacted(15s * kTimeMultiplier)); |
441 | |
|
442 | 0 | ASSERT_OK(LoggedWaitFor( |
443 | 0 | [&] { |
444 | 0 | auto tablet_infos = GetTabletInfosForTable(test_table_id); |
445 | 0 | if (!tablet_infos.ok()) { |
446 | 0 | return false; |
447 | 0 | } |
448 | 0 | for (const auto& tablet : *tablet_infos) { |
449 | 0 | auto replica_map = tablet->GetReplicaLocations(); |
450 | 0 | if (replica_map->find(blacklisted_ts_uuid) == replica_map->end()) { |
451 | 0 | return true; |
452 | 0 | } |
453 | 0 | } |
454 | 0 | return false; |
455 | 0 | }, |
456 | 0 | 60s * kTimeMultiplier, |
457 | 0 | Format( |
458 | 0 | "Wait for at least one test tablet replica on tserver that moved out of blacklist: $0", |
459 | 0 | blacklisted_ts_uuid))); |
460 | 0 | } |
461 | | |
462 | | // Start a tablet split, create an index to start backfill while the split operation is in |
463 | | // progress, and check the backfill state. |
464 | 0 | TEST_F(TabletSplitITest, TestBackfillDuringSplit) { |
465 | 0 | constexpr auto kNumRows = 10000; |
466 | 0 | FLAGS_TEST_apply_tablet_split_inject_delay_ms = 200 * kTimeMultiplier; |
467 | |
|
468 | 0 | CreateSingleTablet(); |
469 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
470 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
471 | 0 | auto table = catalog_mgr->GetTableInfo(table_->id()); |
472 | 0 | auto source_tablet_info = ASSERT_RESULT(GetSingleTestTabletInfo(catalog_mgr)); |
473 | 0 | const auto source_tablet_id = source_tablet_info->id(); |
474 | | |
475 | | // Send SplitTablet RPC to the tablet leader. |
476 | 0 | ASSERT_OK(catalog_mgr->TEST_SplitTablet(source_tablet_info, split_hash_code)); |
477 | |
|
478 | 0 | int indexed_column_index = 1; |
479 | 0 | const client::YBTableName index_name( |
480 | 0 | YQL_DATABASE_CQL, table_.name().namespace_name(), |
481 | 0 | table_.name().table_name() + '_' + |
482 | 0 | table_.schema().Column(indexed_column_index).name() + "_idx"); |
483 | | // Create index while split operation in progress |
484 | 0 | PrepareIndex(client::Transactional(GetIsolationLevel() != IsolationLevel::NON_TRANSACTIONAL), |
485 | 0 | index_name, indexed_column_index); |
486 | | |
487 | | // Check that source table is not backfilling and wait for tablet split completion |
488 | 0 | ASSERT_FALSE(table->IsBackfilling()); |
489 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(2)); |
490 | 0 | ASSERT_OK(CheckPostSplitTabletReplicasData(kNumRows)); |
491 | |
|
492 | 0 | ASSERT_OK(index_.Open(index_name, client_.get())); |
493 | 0 | ASSERT_OK(WaitFor([&] { |
494 | 0 | auto rows_count = SelectRowsCount(NewSession(), index_); |
495 | 0 | if (!rows_count.ok()) { |
496 | 0 | return false; |
497 | 0 | } |
498 | 0 | return *rows_count == kNumRows; |
499 | 0 | }, 30s * kTimeMultiplier, "Waiting for backfill index")); |
500 | 0 | } |
501 | | |
502 | | // Create an index to start backfill, check that split is rejected while backfill is in |
503 | | // progress, and check the backfill state. |
504 | 0 | TEST_F(TabletSplitITest, TestSplitDuringBackfill) { |
505 | 0 | constexpr auto kNumRows = 10000; |
506 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_validate_all_tablet_candidates) = false; |
507 | 0 | FLAGS_TEST_slowdown_backfill_alter_table_rpcs_ms = 200 * kTimeMultiplier; |
508 | |
|
509 | 0 | CreateSingleTablet(); |
510 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
511 | |
|
512 | 0 | int indexed_column_index = 1; |
513 | 0 | const client::YBTableName index_name( |
514 | 0 | YQL_DATABASE_CQL, table_.name().namespace_name(), |
515 | 0 | table_.name().table_name() + '_' + |
516 | 0 | table_.schema().Column(indexed_column_index).name() + "_idx"); |
517 | | // Create index and start backfill |
518 | 0 | PrepareIndex(client::Transactional(GetIsolationLevel() != IsolationLevel::NON_TRANSACTIONAL), |
519 | 0 | index_name, indexed_column_index); |
520 | |
|
521 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
522 | 0 | auto table = catalog_mgr->GetTableInfo(table_->id()); |
523 | 0 | auto source_tablet_info = ASSERT_RESULT(GetSingleTestTabletInfo(catalog_mgr)); |
524 | 0 | const auto source_tablet_id = source_tablet_info->id(); |
525 | | |
526 | | // Check that source table is backfilling |
527 | 0 | ASSERT_OK(WaitFor([&] { |
528 | 0 | return table->IsBackfilling(); |
529 | 0 | }, 30s * kTimeMultiplier, "Waiting for start backfill index")); |
530 | | |
531 | | // Send SplitTablet RPC to the tablet leader while backfill in progress |
532 | 0 | ASSERT_NOK(catalog_mgr->TEST_SplitTablet(source_tablet_info, split_hash_code)); |
533 | |
|
534 | 0 | ASSERT_OK(WaitFor([&] { |
535 | 0 | return !table->IsBackfilling(); |
536 | 0 | }, 30s * kTimeMultiplier, "Waiting for backfill index")); |
537 | 0 | ASSERT_OK(catalog_mgr->TEST_SplitTablet(source_tablet_info, split_hash_code)); |
538 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(2)); |
539 | 0 | ASSERT_OK(CheckPostSplitTabletReplicasData(kNumRows)); |
540 | 0 | } |
541 | | |
542 | | // Test for https://github.com/yugabyte/yugabyte-db/issues/4312 reproducing a deadlock |
543 | | // between TSTabletManager::ApplyTabletSplit and Heartbeater::Thread::TryHeartbeat. |
544 | 0 | TEST_F(TabletSplitITest, SlowSplitSingleTablet) { |
545 | 0 | const auto leader_failure_timeout = FLAGS_leader_failure_max_missed_heartbeat_periods * |
546 | 0 | FLAGS_raft_heartbeat_interval_ms; |
547 | |
|
548 | 0 | FLAGS_TEST_apply_tablet_split_inject_delay_ms = 200 * kTimeMultiplier; |
549 | | // We want heartbeater to be called during tablet split apply to reproduce deadlock bug. |
550 | 0 | FLAGS_heartbeat_interval_ms = FLAGS_TEST_apply_tablet_split_inject_delay_ms / 3; |
551 | | // We reduce FLAGS_leader_lease_duration_ms for ReplicaState::GetLeaderState to avoid always |
552 | | // reusing results from cache on heartbeat, otherwise it won't lock ReplicaState mutex. |
553 | 0 | FLAGS_leader_lease_duration_ms = FLAGS_TEST_apply_tablet_split_inject_delay_ms / 2; |
554 | | // Reduce raft_heartbeat_interval_ms for leader lease to be reliably replicated. |
555 | 0 | FLAGS_raft_heartbeat_interval_ms = FLAGS_leader_lease_duration_ms / 2; |
556 | | // Keep leader failure timeout the same to avoid flaky losses of leader with short heartbeats. |
557 | 0 | FLAGS_leader_failure_max_missed_heartbeat_periods = |
558 | 0 | leader_failure_timeout / FLAGS_raft_heartbeat_interval_ms; |
559 | |
|
560 | 0 | constexpr auto kNumRows = 50; |
561 | |
|
562 | 0 | ASSERT_OK(CreateSingleTabletAndSplit(kNumRows)); |
563 | 0 | } |
564 | | |
565 | 0 | TEST_F(TabletSplitITest, SplitTabletDuringReadWriteLoad) { |
566 | 0 | constexpr auto kNumTablets = 3; |
567 | |
|
568 | 0 | FLAGS_db_write_buffer_size = 100_KB; |
569 | |
|
570 | 0 | TestWorkload workload(cluster_.get()); |
571 | 0 | workload.set_table_name(client::kTableName); |
572 | 0 | workload.set_write_timeout_millis(MonoDelta(kRpcTimeout).ToMilliseconds()); |
573 | 0 | workload.set_num_tablets(kNumTablets); |
574 | 0 | workload.set_num_read_threads(4); |
575 | 0 | workload.set_num_write_threads(2); |
576 | 0 | workload.set_write_batch_size(50); |
577 | 0 | workload.set_payload_bytes(16); |
578 | 0 | workload.set_sequential_write(true); |
579 | 0 | workload.set_retry_on_restart_required_error(true); |
580 | 0 | workload.set_read_only_written_keys(true); |
581 | 0 | workload.Setup(); |
582 | |
|
583 | 0 | const auto test_table_id = ASSERT_RESULT(GetTestTableId()); |
584 | |
|
585 | 0 | auto peers = ASSERT_RESULT(WaitForTableActiveTabletLeadersPeers( |
586 | 0 | cluster_.get(), test_table_id, kNumTablets)); |
587 | |
|
588 | 0 | LOG(INFO) << "Starting workload ..."; |
589 | 0 | workload.Start(); |
590 | |
|
591 | 0 | for (const auto& peer : peers) { |
592 | 0 | ASSERT_OK(LoggedWaitFor( |
593 | 0 | [&peer] { |
594 | 0 | const auto data_size = |
595 | 0 | peer->tablet()->TEST_db()->GetCurrentVersionSstFilesUncompressedSize(); |
596 | 0 | YB_LOG_EVERY_N_SECS(INFO, 5) << "Data written: " << data_size; |
597 | 0 | size_t expected_size = (FLAGS_rocksdb_level0_file_num_compaction_trigger + 1) * |
598 | 0 | FLAGS_db_write_buffer_size; |
599 | 0 | return data_size > expected_size; |
600 | 0 | }, |
601 | 0 | 60s * kTimeMultiplier, Format("Writing data to split (tablet $0) ...", peer->tablet_id()))); |
602 | 0 | } |
603 | |
|
604 | 0 | DumpWorkloadStats(workload); |
605 | |
|
606 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
607 | |
|
608 | 0 | for (const auto& peer : peers) { |
609 | 0 | const auto& source_tablet = *ASSERT_NOTNULL(peer->tablet()); |
610 | 0 | ASSERT_OK(SplitTablet(catalog_mgr, source_tablet)); |
611 | 0 | } |
612 | |
|
613 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(/* expected_non_split_tablets =*/ kNumTablets * 2)); |
614 | |
|
615 | 0 | DumpTableLocations(catalog_mgr, client::kTableName); |
616 | | |
617 | | // Generate some more read/write traffic after tablets are split and after that we check data |
618 | | // for consistency and that failures rates are acceptable. |
619 | 0 | std::this_thread::sleep_for(5s); |
620 | |
|
621 | 0 | LOG(INFO) << "Stopping workload ..."; |
622 | 0 | workload.StopAndJoin(); |
623 | |
|
624 | 0 | DumpWorkloadStats(workload); |
625 | |
|
626 | 0 | ASSERT_NO_FATALS(CheckTableKeysInRange(workload.rows_inserted())); |
627 | |
|
628 | 0 | const auto insert_failure_rate = 1.0 * workload.rows_insert_failed() / workload.rows_inserted(); |
629 | 0 | const auto read_failure_rate = 1.0 * workload.rows_read_error() / workload.rows_read_ok(); |
630 | 0 | const auto read_try_again_rate = 1.0 * workload.rows_read_try_again() / workload.rows_read_ok(); |
631 | |
|
632 | 0 | ASSERT_LT(insert_failure_rate, 0.01); |
633 | 0 | ASSERT_LT(read_failure_rate, 0.01); |
634 | | // TODO(tsplit): lower this threshold as internal (without reaching client app) read retries |
635 | | // implemented for split tablets. |
636 | 0 | ASSERT_LT(read_try_again_rate, 0.1); |
637 | 0 | ASSERT_EQ(workload.rows_read_empty(), 0); |
638 | | |
639 | | // TODO(tsplit): Check with different isolation levels. |
640 | |
|
641 | 0 | ASSERT_OK(cluster_->RestartSync()); |
642 | 0 | } |
643 | | |
644 | 0 | void TabletSplitITest::SplitClientRequestsIds(int split_depth) { |
645 | | // Set data block size low enough, so we have enough data blocks for middle key |
646 | | // detection to work correctly. |
647 | 0 | FLAGS_db_block_size_bytes = 1_KB; |
648 | 0 | const auto kNumRows = 50 * (1 << split_depth); |
649 | |
|
650 | 0 | SetNumTablets(1); |
651 | 0 | CreateTable(); |
652 | |
|
653 | 0 | ASSERT_OK(WriteRows(kNumRows, 1)); |
654 | |
|
655 | 0 | ASSERT_OK(CheckRowsCount(kNumRows)); |
656 | |
|
657 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
658 | |
|
659 | 0 | for (int i = 0; i < split_depth; ++i) { |
660 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
661 | 0 | ASSERT_EQ(peers.size(), 1 << i); |
662 | 0 | for (const auto& peer : peers) { |
663 | 0 | const auto tablet = peer->shared_tablet(); |
664 | 0 | ASSERT_OK(tablet->Flush(tablet::FlushMode::kSync)); |
665 | 0 | tablet->ForceRocksDBCompactInTest(); |
666 | 0 | ASSERT_OK(SplitTablet(catalog_mgr, *tablet)); |
667 | 0 | } |
668 | |
|
669 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
670 | 0 | /* expected_non_split_tablets =*/ 1 << (i + 1))); |
671 | 0 | } |
672 | |
|
673 | 0 | Status s; |
674 | 0 | ASSERT_OK(WaitFor([&] { |
675 | 0 | s = ResultToStatus(WriteRows(1, 1)); |
676 | 0 | return !s.IsTryAgain(); |
677 | 0 | }, 60s * kTimeMultiplier, "Waiting for successful write")); |
678 | 0 | ASSERT_OK(s); |
679 | 0 | } |
680 | | |
681 | | // Test for https://github.com/yugabyte/yugabyte-db/issues/5415. |
682 | | // Client knows about split parent for final tablets. |
683 | 0 | TEST_F(TabletSplitITest, SplitClientRequestsIdsDepth1) { |
684 | 0 | SplitClientRequestsIds(1); |
685 | 0 | } |
686 | | |
687 | | // Test for https://github.com/yugabyte/yugabyte-db/issues/5415. |
688 | | // Client doesn't know about split parent for final tablets. |
689 | 0 | TEST_F(TabletSplitITest, SplitClientRequestsIdsDepth2) { |
690 | 0 | SplitClientRequestsIds(2); |
691 | 0 | } |
692 | | |
693 | 0 | TEST_F(TabletSplitITest, SplitSingleTabletWithLimit) { |
694 | 0 | FLAGS_db_block_size_bytes = 1_KB; |
695 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_validate_all_tablet_candidates) = false; |
696 | 0 | const auto kSplitDepth = 3; |
697 | 0 | const auto kNumRows = 50 * (1 << kSplitDepth); |
698 | 0 | FLAGS_tablet_split_limit_per_table = (1 << kSplitDepth) - 1; |
699 | |
|
700 | 0 | CreateSingleTablet(); |
701 | 0 | ASSERT_OK(WriteRows(kNumRows, 1)); |
702 | 0 | ASSERT_OK(CheckRowsCount(kNumRows)); |
703 | |
|
704 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
705 | |
|
706 | 0 | master::TableIdentifierPB table_id_pb; |
707 | 0 | table_id_pb.set_table_id(table_->id()); |
708 | 0 | bool reached_split_limit = false; |
709 | |
|
710 | 0 | for (int i = 0; i < kSplitDepth; ++i) { |
711 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
712 | 0 | bool expect_split = false; |
713 | 0 | for (const auto& peer : peers) { |
714 | 0 | const auto tablet = peer->shared_tablet(); |
715 | 0 | ASSERT_OK(tablet->Flush(tablet::FlushMode::kSync)); |
716 | 0 | tablet->ForceRocksDBCompactInTest(); |
717 | 0 | auto table_info = ASSERT_RESULT(catalog_mgr->FindTable(table_id_pb)); |
718 | |
|
719 | 0 | expect_split = table_info->NumPartitions() < FLAGS_tablet_split_limit_per_table; |
720 | |
|
721 | 0 | if (expect_split) { |
722 | 0 | ASSERT_OK(DoSplitTablet(catalog_mgr, *tablet)); |
723 | 0 | } else { |
724 | 0 | const auto split_status = DoSplitTablet(catalog_mgr, *tablet); |
725 | 0 | ASSERT_EQ(master::MasterError(split_status), |
726 | 0 | master::MasterErrorPB::REACHED_SPLIT_LIMIT); |
727 | 0 | reached_split_limit = true; |
728 | 0 | } |
729 | 0 | } |
730 | 0 | if (expect_split) { |
731 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
732 | 0 | /* expected_non_split_tablets =*/1 << (i + 1))); |
733 | 0 | } |
734 | 0 | } |
735 | |
|
736 | 0 | ASSERT_TRUE(reached_split_limit); |
737 | |
|
738 | 0 | Status s; |
739 | 0 | ASSERT_OK(WaitFor([&] { |
740 | 0 | s = ResultToStatus(WriteRows(1, 1)); |
741 | 0 | return !s.IsTryAgain(); |
742 | 0 | }, 60s * kTimeMultiplier, "Waiting for successful write")); |
743 | |
|
744 | 0 | auto table_info = ASSERT_RESULT(catalog_mgr->FindTable(table_id_pb)); |
745 | 0 | ASSERT_EQ(table_info->NumPartitions(), FLAGS_tablet_split_limit_per_table); |
746 | 0 | } |
747 | | |
748 | 0 | TEST_F(TabletSplitITest, SplitDuringReplicaOffline) { |
749 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
750 | |
|
751 | 0 | SetNumTablets(1); |
752 | 0 | CreateTable(); |
753 | |
|
754 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
755 | |
|
756 | 0 | auto rows_count = ASSERT_RESULT(SelectRowsCount(NewSession(), table_)); |
757 | 0 | ASSERT_EQ(rows_count, kNumRows); |
758 | |
|
759 | 0 | auto* catalog_mgr = ASSERT_RESULT(catalog_manager()); |
760 | |
|
761 | 0 | auto source_tablet_info = ASSERT_RESULT(GetSingleTestTabletInfo(catalog_mgr)); |
762 | 0 | const auto source_tablet_id = source_tablet_info->id(); |
763 | |
|
764 | 0 | cluster_->mini_tablet_server(0)->Shutdown(); |
765 | |
|
766 | 0 | LOG(INFO) << "Stopped TS-1"; |
767 | |
|
768 | 0 | ASSERT_OK(catalog_mgr->TEST_SplitTablet(source_tablet_info, split_hash_code)); |
769 | |
|
770 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
771 | 0 | /* expected_non_split_tablets =*/ 2, /* expected_split_tablets =*/ 1, |
772 | 0 | /* num_replicas_online =*/ 2)); |
773 | |
|
774 | 0 | ASSERT_OK(CheckPostSplitTabletReplicasData(kNumRows, 2)); |
775 | |
|
776 | 0 | ASSERT_OK(CheckSourceTabletAfterSplit(source_tablet_id)); |
777 | |
|
778 | 0 | DumpTableLocations(catalog_mgr, client::kTableName); |
779 | |
|
780 | 0 | rows_count = ASSERT_RESULT(SelectRowsCount(NewSession(), table_)); |
781 | 0 | ASSERT_EQ(rows_count, kNumRows); |
782 | |
|
783 | 0 | ASSERT_OK(WriteRows(kNumRows, kNumRows + 1)); |
784 | |
|
785 | 0 | LOG(INFO) << "Starting TS-1"; |
786 | |
|
787 | 0 | ASSERT_OK(cluster_->mini_tablet_server(0)->Start()); |
788 | | |
789 | | // This time we expect all replicas to be online. |
790 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
791 | 0 | /* expected_non_split_tablets =*/ 2, /* expected_split_tablets =*/ 0)); |
792 | |
|
793 | 0 | Status s; |
794 | 0 | ASSERT_OK_PREPEND(LoggedWaitFor([&] { |
795 | 0 | s = CheckPostSplitTabletReplicasData(kNumRows * 2); |
796 | 0 | return s.IsOk(); |
797 | 0 | }, 30s * kTimeMultiplier, "Waiting for TS-1 to catch up ..."), AsString(s)); |
798 | 0 | } |
799 | | |
800 | | // Test for https://github.com/yugabyte/yugabyte-db/issues/6890. |
801 | | // Writes data to the tablet, splits it and then tries to do full scan with `select count(*)` |
802 | | // using two different instances of YBTable one after another. |
803 | 0 | TEST_F(TabletSplitITest, DifferentYBTableInstances) { |
804 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
805 | |
|
806 | 0 | CreateSingleTablet(); |
807 | |
|
808 | 0 | client::TableHandle table1, table2; |
809 | 0 | for (auto* table : {&table1, &table2}) { |
810 | 0 | ASSERT_OK(table->Open(client::kTableName, client_.get())); |
811 | 0 | } |
812 | |
|
813 | 0 | const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
814 | 0 | const auto source_tablet_id = ASSERT_RESULT(SplitTabletAndValidate(split_hash_code, kNumRows)); |
815 | |
|
816 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(/* expected_non_split_tablets =*/ 2)); |
817 | |
|
818 | 0 | auto rows_count = ASSERT_RESULT(SelectRowsCount(NewSession(), table1)); |
819 | 0 | ASSERT_EQ(rows_count, kNumRows); |
820 | |
|
821 | 0 | rows_count = ASSERT_RESULT(SelectRowsCount(NewSession(), table2)); |
822 | 0 | ASSERT_EQ(rows_count, kNumRows); |
823 | 0 | } |
824 | | |
825 | 0 | TEST_F(TabletSplitITest, SplitSingleTabletLongTransactions) { |
826 | 0 | constexpr auto kNumRows = 1000; |
827 | 0 | constexpr auto kNumApplyLargeTxnBatches = 10; |
828 | 0 | FLAGS_txn_max_apply_batch_records = kNumRows / kNumApplyLargeTxnBatches; |
829 | 0 | FLAGS_TEST_pause_and_skip_apply_intents_task_loop_ms = 1; |
830 | | |
831 | | // Write enough rows to trigger the large transaction apply path with kNumApplyLargeTxnBatches |
832 | | // batches. Wait for post split compaction and validate data before returning. |
833 | 0 | ASSERT_OK(CreateSingleTabletAndSplit(kNumRows)); |
834 | | |
835 | | // At this point, post split compaction has happened, and no apply intent task iterations have |
836 | | // run. If post-split compaction has improperly handled ApplyTransactionState present in |
837 | | // regulardb, e.g. by deleting it, then upon restart, one or both of the new child subtablets will |
838 | | // lose all unapplied data. |
839 | 0 | ASSERT_OK(cluster_->RestartSync()); |
840 | | |
841 | | // If we did not lose any large transaction apply data during post-split compaction, then we |
842 | | // should have all rows present in the database. |
843 | 0 | EXPECT_OK(CheckRowsCount(kNumRows)); |
844 | 0 | } |
845 | | |
846 | | class TabletSplitYedisTableTest : public integration_tests::RedisTableTestBase { |
847 | | protected: |
848 | 0 | int num_tablets() override { return 1; } |
849 | | }; |
850 | | |
851 | 0 | TEST_F(TabletSplitYedisTableTest, BlockSplittingYedisTablet) { |
852 | 0 | constexpr int kNumRows = 10000; |
853 | |
|
854 | 0 | for (int i = 0; i < kNumRows; ++i) { |
855 | 0 | PutKeyValue(Format("$0", i), Format("$0", i)); |
856 | 0 | } |
857 | |
|
858 | 0 | for (const auto& peer : ListTableActiveTabletPeers(mini_cluster(), table_->id())) { |
859 | 0 | ASSERT_OK(peer->shared_tablet()->Flush(tablet::FlushMode::kSync)); |
860 | 0 | } |
861 | |
|
862 | 0 | for (const auto& peer : ListTableActiveTabletLeadersPeers(mini_cluster(), table_->id())) { |
863 | 0 | auto catalog_manager = &CHECK_NOTNULL( |
864 | 0 | ASSERT_RESULT(this->mini_cluster()->GetLeaderMiniMaster()))->catalog_manager(); |
865 | |
|
866 | 0 | auto s = DoSplitTablet(catalog_manager, *peer->shared_tablet()); |
867 | 0 | EXPECT_NOT_OK(s); |
868 | 0 | EXPECT_TRUE(s.IsNotSupported()) << s.ToString(); |
869 | 0 | } |
870 | 0 | } |
871 | | |
872 | | class AutomaticTabletSplitITest : public TabletSplitITest { |
873 | | public: |
874 | 9 | void SetUp() override { |
875 | 9 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_heartbeat_metrics_interval_ms) = 100; |
876 | 9 | TabletSplitITest::SetUp(); |
877 | 9 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = true; |
878 | 9 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_validate_all_tablet_candidates) = false; |
879 | 9 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 5; |
880 | 9 | } |
881 | | |
882 | | protected: |
883 | 0 | CHECKED_STATUS FlushAllTabletReplicas(const TabletId& tablet_id) { |
884 | 0 | for (const auto& active_peer : ListTableActiveTabletPeers(cluster_.get(), table_->id())) { |
885 | 0 | if (active_peer->tablet_id() == tablet_id) { |
886 | 0 | RETURN_NOT_OK(active_peer->shared_tablet()->Flush(tablet::FlushMode::kSync)); |
887 | 0 | } |
888 | 0 | } |
889 | 0 | return Status::OK(); |
890 | 0 | } |
891 | | |
892 | | CHECKED_STATUS AutomaticallySplitSingleTablet( |
893 | | const string& tablet_id, int num_rows_per_batch, |
894 | 0 | uint64_t threshold, int* key) { |
895 | 0 | uint64_t current_size = 0; |
896 | 0 | auto cur_num_tablets = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()).size(); |
897 | 0 | while (current_size <= threshold) { |
898 | 0 | RETURN_NOT_OK(WriteRows(num_rows_per_batch, *key)); |
899 | 0 | *key += num_rows_per_batch; |
900 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
901 | 0 | LOG(INFO) << "Active peers: " << peers.size(); |
902 | 0 | if (peers.size() == cur_num_tablets + 1) { |
903 | 0 | break; |
904 | 0 | } |
905 | 0 | if (peers.size() != cur_num_tablets) { |
906 | 0 | return STATUS_FORMAT(IllegalState, |
907 | 0 | "Expected number of peers: $0, actual: $1", cur_num_tablets, peers.size()); |
908 | 0 | } |
909 | 0 | auto leader_peer = peers.at(0); |
910 | 0 | for (auto peer : peers) { |
911 | 0 | if (peer->tablet_id() == tablet_id) { |
912 | 0 | leader_peer = peer; |
913 | 0 | break; |
914 | 0 | } |
915 | 0 | } |
916 | | // Flush all replicas of this shard to ensure that even if the leader changed we will be in a |
917 | | // state where yb-master should initiate a split. |
918 | 0 | RETURN_NOT_OK(FlushAllTabletReplicas(leader_peer->tablet_id())); |
919 | 0 | current_size = leader_peer->shared_tablet()->GetCurrentVersionSstFilesSize(); |
920 | 0 | } |
921 | 0 | RETURN_NOT_OK(WaitForTabletSplitCompletion( |
922 | 0 | /* expected_non_split_tablets =*/ cur_num_tablets + 1)); |
923 | 0 | return Status::OK(); |
924 | 0 | } |
925 | | |
926 | 0 | CHECKED_STATUS CompactTablet(const string& tablet_id) { |
927 | 0 | auto peers = ListTabletPeers(cluster_.get(), [&tablet_id](auto peer) { |
928 | 0 | return peer->tablet_id() == tablet_id; |
929 | 0 | }); |
930 | 0 | for (const auto& peer : peers) { |
931 | 0 | const auto tablet = peer->shared_tablet(); |
932 | 0 | RETURN_NOT_OK(tablet->Flush(tablet::FlushMode::kSync)); |
933 | 0 | tablet->ForceRocksDBCompactInTest(); |
934 | 0 | } |
935 | 0 | return Status::OK(); |
936 | 0 | } |
937 | | |
938 | 0 | void SleepForBgTaskIters(int num_iters) { |
939 | 0 | std::this_thread::sleep_for(1ms * |
940 | 0 | (FLAGS_catalog_manager_bg_task_wait_ms * num_iters + |
941 | 0 | FLAGS_tserver_heartbeat_metrics_interval_ms)); |
942 | 0 | std::this_thread::sleep_for((FLAGS_catalog_manager_bg_task_wait_ms * num_iters + |
943 | 0 | FLAGS_tserver_heartbeat_metrics_interval_ms) * 1ms); |
944 | 0 | } |
945 | | }; |
946 | | |
947 | 0 | TEST_F(AutomaticTabletSplitITest, AutomaticTabletSplitting) { |
948 | 0 | constexpr int kNumRowsPerBatch = 1000; |
949 | |
|
950 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
951 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 100_KB; |
952 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_disable_compactions) = true; |
953 | |
|
954 | 0 | int key = 1; |
955 | 0 | CreateSingleTablet(); |
956 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
957 | 0 | ASSERT_EQ(peers.size(), 1); |
958 | 0 | ASSERT_OK(AutomaticallySplitSingleTablet(peers.at(0)->tablet_id(), kNumRowsPerBatch, |
959 | 0 | FLAGS_tablet_split_low_phase_size_threshold_bytes, &key)); |
960 | | |
961 | | // Since compaction is off, the tablets should not be further split since they won't have had |
962 | | // their post split compaction. Assert this is true by tripling the number of keys written and |
963 | | // seeing the number of tablets not grow. |
964 | 0 | auto triple_keys = key * 2; |
965 | 0 | while (key < triple_keys) { |
966 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, key)); |
967 | 0 | key += kNumRowsPerBatch; |
968 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
969 | 0 | EXPECT_EQ(peers.size(), 2); |
970 | 0 | } |
971 | 0 | } |
972 | | |
973 | | |
974 | 0 | TEST_F(AutomaticTabletSplitITest, TabletSplitHasClusterReplicationInfo) { |
975 | 0 | constexpr int kNumRowsPerBatch = 1000; |
976 | | // This test relies on the fact that the high_phase_size_threshold > force_split_threshold |
977 | | // to ensure that without the placement code the test will fail |
978 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 0; |
979 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_shard_count_per_node) = 1; |
980 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 50_KB; |
981 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_size_threshold_bytes) = 100_KB; |
982 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_force_split_threshold_bytes) = 50_KB; |
983 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_post_split_compaction) = true; |
984 | | // Disable automatic compactions |
985 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_base_background_compactions) = 0; |
986 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_background_compactions) = 0; |
987 | | // Disable manual compations from flushes |
988 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = -1; |
989 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_nodes_per_cloud) = 1; |
990 | |
|
991 | 0 | std::vector<string> clouds = {"cloud1_split", "cloud2_split", "cloud3_split", "cloud4_split"}; |
992 | 0 | std::vector<string> regions = {"rack1_split", "rack2_split", "rack3_split", "rack4_split"}; |
993 | 0 | std::vector<string> zones = {"zone1_split", "zone2_split", "zone3_split", "zone4_split"}; |
994 | | |
995 | | // Create 4 tservers with the placement info from above |
996 | 0 | for (size_t i = 0; i < clouds.size(); i++) { |
997 | 0 | tserver::TabletServerOptions extra_opts = |
998 | 0 | ASSERT_RESULT(tserver::TabletServerOptions::CreateTabletServerOptions()); |
999 | 0 | extra_opts.SetPlacement(clouds.at(i), regions.at(i), zones.at(i)); |
1000 | 0 | auto new_ts = cluster_->num_tablet_servers(); |
1001 | 0 | ASSERT_OK(cluster_->AddTabletServer(extra_opts)); |
1002 | 0 | ASSERT_OK(cluster_->WaitForTabletServerCount(new_ts + 1)); |
1003 | 0 | } |
1004 | | |
1005 | | // Set cluster level placement information using only the first 3 clouds/regions/zones |
1006 | 0 | master::ReplicationInfoPB replication_info; |
1007 | 0 | replication_info.mutable_live_replicas()->set_num_replicas( |
1008 | 0 | narrow_cast<int32_t>(clouds.size() - 1)); |
1009 | 0 | for (size_t i = 0; i < clouds.size() - 1; i++) { |
1010 | 0 | auto* placement_block = replication_info.mutable_live_replicas()->add_placement_blocks(); |
1011 | 0 | auto* cloud_info = placement_block->mutable_cloud_info(); |
1012 | 0 | cloud_info->set_placement_cloud(clouds.at(i)); |
1013 | 0 | cloud_info->set_placement_region(regions.at(i)); |
1014 | 0 | cloud_info->set_placement_zone(zones.at(i)); |
1015 | 0 | placement_block->set_min_num_replicas(1); |
1016 | 0 | } |
1017 | 0 | ASSERT_OK(client_->SetReplicationInfo(replication_info)); |
1018 | | |
1019 | | // Create and split single tablet into 2 partitions |
1020 | | // The split should happen at the high threshold |
1021 | 0 | int key = 1; |
1022 | 0 | CreateSingleTablet(); |
1023 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1024 | 0 | ASSERT_EQ(peers.size(), 1); |
1025 | 0 | ASSERT_OK(AutomaticallySplitSingleTablet(peers.at(0)->tablet_id(), kNumRowsPerBatch, |
1026 | 0 | FLAGS_tablet_split_high_phase_size_threshold_bytes, &key)); |
1027 | |
|
1028 | 0 | peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1029 | 0 | ASSERT_EQ(peers.size(), 2); |
1030 | 0 | auto tablet_id_to_split = peers.at(0)->tablet_id(); |
1031 | | |
1032 | | // Split one of the 2 tablets to get 3 partitions |
1033 | | // The split should happen at the high threshhold |
1034 | 0 | ASSERT_OK(CompactTablet(tablet_id_to_split)); |
1035 | 0 | ASSERT_OK(AutomaticallySplitSingleTablet(tablet_id_to_split, kNumRowsPerBatch, |
1036 | 0 | FLAGS_tablet_split_high_phase_size_threshold_bytes, &key)); |
1037 | | |
1038 | | // Split one of the 3 remaining tablets to get 4 partitions |
1039 | | // The split should happen at the force split threshold |
1040 | | // We set the high phase > force split to ensure that we split at the force split level |
1041 | | // given the custom placement information |
1042 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_size_threshold_bytes) = 300_KB; |
1043 | 0 | tablet_id_to_split = peers.at(1)->tablet_id(); |
1044 | 0 | ASSERT_OK(CompactTablet(tablet_id_to_split)); |
1045 | 0 | ASSERT_OK(AutomaticallySplitSingleTablet(tablet_id_to_split, kNumRowsPerBatch, |
1046 | 0 | FLAGS_tablet_force_split_threshold_bytes, &key)); |
1047 | 0 | } |
1048 | | |
1049 | 0 | TEST_F(AutomaticTabletSplitITest, AutomaticTabletSplittingWaitsForAllPeersCompacted) { |
1050 | 0 | constexpr auto kNumRowsPerBatch = 1000; |
1051 | |
|
1052 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 5; |
1053 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 2; |
1054 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 100_KB; |
1055 | | // Disable post split compaction |
1056 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_post_split_compaction) = true; |
1057 | | // Disable automatic compactions |
1058 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_base_background_compactions) = 0; |
1059 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_background_compactions) = 0; |
1060 | | // Disable manual compations from flushes |
1061 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = -1; |
1062 | |
|
1063 | 0 | int key = 1; |
1064 | 0 | CreateSingleTablet(); |
1065 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1066 | 0 | ASSERT_EQ(peers.size(), 1); |
1067 | 0 | ASSERT_OK(AutomaticallySplitSingleTablet(peers.at(0)->tablet_id(), kNumRowsPerBatch, |
1068 | 0 | FLAGS_tablet_split_low_phase_size_threshold_bytes, &key)); |
1069 | |
|
1070 | 0 | std::unordered_set<string> tablet_ids = ListActiveTabletIdsForTable(cluster_.get(), table_->id()); |
1071 | 0 | ASSERT_EQ(tablet_ids.size(), 2); |
1072 | 0 | auto expected_num_tablets = 2; |
1073 | | |
1074 | | // Compact peers one by one and ensure a tablet is not split until all peers are compacted |
1075 | 0 | for (const auto& tablet_id : tablet_ids) { |
1076 | 0 | auto peers = ListTabletPeers(cluster_.get(), [&tablet_id](auto peer) { |
1077 | 0 | return peer->tablet_id() == tablet_id; |
1078 | 0 | }); |
1079 | 0 | ASSERT_EQ(peers.size(), FLAGS_replication_factor); |
1080 | 0 | for (const auto& peer : peers) { |
1081 | | // We shouldn't have split this tablet yet since not all peers are compacted yet |
1082 | 0 | EXPECT_EQ( |
1083 | 0 | ListTableActiveTabletPeers(cluster_.get(), table_->id()).size(), |
1084 | 0 | expected_num_tablets * FLAGS_replication_factor); |
1085 | | |
1086 | | // Force a manual rocksdb compaction on the peer tablet and wait for it to complete |
1087 | 0 | const auto tablet = peer->shared_tablet(); |
1088 | 0 | ASSERT_OK(tablet->Flush(tablet::FlushMode::kSync)); |
1089 | 0 | tablet->ForceRocksDBCompactInTest(); |
1090 | 0 | ASSERT_OK(LoggedWaitFor( |
1091 | 0 | [peer]() -> Result<bool> { |
1092 | 0 | return peer->tablet_metadata()->has_been_fully_compacted(); |
1093 | 0 | }, |
1094 | 0 | 15s * kTimeMultiplier, |
1095 | 0 | "Wait for post tablet split compaction to be completed for peer: " + peer->tablet_id())); |
1096 | | |
1097 | | // Write enough data to get the tablet into a state where it's large enough for a split |
1098 | 0 | int64_t current_size = 0; |
1099 | 0 | while (current_size <= FLAGS_tablet_split_low_phase_size_threshold_bytes) { |
1100 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, key)); |
1101 | 0 | key += kNumRowsPerBatch; |
1102 | 0 | ASSERT_OK(FlushAllTabletReplicas(tablet_id)); |
1103 | 0 | auto current_size_res = GetMinSstFileSizeAmongAllReplicas(tablet_id); |
1104 | 0 | if (!current_size_res.ok()) { |
1105 | 0 | break; |
1106 | 0 | } |
1107 | 0 | current_size = current_size_res.get(); |
1108 | 0 | } |
1109 | | |
1110 | | // Wait for a potential split to get triggered |
1111 | 0 | std::this_thread::sleep_for( |
1112 | 0 | 2 * (FLAGS_catalog_manager_bg_task_wait_ms * 2ms + FLAGS_raft_heartbeat_interval_ms * 2ms)); |
1113 | 0 | } |
1114 | | |
1115 | | // Now that all peers have been compacted, we expect this tablet to get split. |
1116 | 0 | ASSERT_OK( |
1117 | 0 | WaitForTabletSplitCompletion(++expected_num_tablets)); |
1118 | 0 | } |
1119 | 0 | } |
1120 | | |
1121 | | |
1122 | 0 | TEST_F(AutomaticTabletSplitITest, AutomaticTabletSplittingMovesToNextPhase) { |
1123 | 0 | constexpr int kNumRowsPerBatch = 1000; |
1124 | |
|
1125 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 5; |
1126 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 50_KB; |
1127 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_size_threshold_bytes) = 100_KB; |
1128 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
1129 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_shard_count_per_node) = 2; |
1130 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_disable_compactions) = true; |
1131 | |
|
1132 | 0 | auto num_tservers = cluster_->num_tablet_servers(); |
1133 | 0 | const auto this_phase_tablet_lower_limit = |
1134 | 0 | FLAGS_tablet_split_low_phase_shard_count_per_node * num_tservers; |
1135 | 0 | const auto this_phase_tablet_upper_limit = |
1136 | 0 | FLAGS_tablet_split_high_phase_shard_count_per_node * num_tservers; |
1137 | | |
1138 | | // Create table with a number of tablets that puts it into the high phase for tablet splitting. |
1139 | 0 | SetNumTablets(narrow_cast<uint32_t>(this_phase_tablet_lower_limit)); |
1140 | 0 | CreateTable(); |
1141 | |
|
1142 | 0 | auto get_num_tablets = [this]() { |
1143 | 0 | return ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()).size(); |
1144 | 0 | }; |
1145 | |
|
1146 | 0 | auto key = 1; |
1147 | 0 | while (get_num_tablets() < this_phase_tablet_upper_limit) { |
1148 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, key)); |
1149 | 0 | key += kNumRowsPerBatch; |
1150 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1151 | 0 | for (const auto& peer : peers) { |
1152 | | // Flush other replicas of this shard to ensure that even if the leader changed we will be in |
1153 | | // a state where yb-master should initiate a split. |
1154 | 0 | ASSERT_OK(FlushAllTabletReplicas(peer->tablet_id())); |
1155 | 0 | auto peer_tablet = peer->shared_tablet(); |
1156 | 0 | if (!peer_tablet) { |
1157 | | // If this tablet was split after we computed peers above, then the shared_tablet() call may |
1158 | | // return null. |
1159 | 0 | continue; |
1160 | 0 | } |
1161 | 0 | ssize_t size = peer->shared_tablet()->GetCurrentVersionSstFilesSize(); |
1162 | 0 | if (size > FLAGS_tablet_split_high_phase_size_threshold_bytes) { |
1163 | | // Wait for the tablet count to go up by at least one, indicating some tablet was split, or |
1164 | | // for the total number of tablets to put this table outside of the high phase. |
1165 | 0 | ASSERT_OK(WaitFor([&]() { |
1166 | 0 | auto num_tablets = get_num_tablets(); |
1167 | 0 | return num_tablets > peers.size() || num_tablets >= this_phase_tablet_upper_limit; |
1168 | 0 | }, 10s * kTimeMultiplier, "Waiting for split of oversized tablet.")); |
1169 | 0 | } |
1170 | 0 | } |
1171 | 0 | } |
1172 | 0 | EXPECT_EQ(get_num_tablets(), this_phase_tablet_upper_limit); |
1173 | 0 | } |
1174 | | |
1175 | 0 | TEST_F(AutomaticTabletSplitITest, AutomaticTabletSplittingMultiPhase) { |
1176 | 0 | constexpr int kNumRowsPerBatch = RegularBuildVsSanitizers(5000, 1000); |
1177 | |
|
1178 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 10_KB; |
1179 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_size_threshold_bytes) = 20_KB; |
1180 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
1181 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_shard_count_per_node) = 2; |
1182 | | // Disable automatic compactions, but continue to allow manual compactions. |
1183 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_base_background_compactions) = 0; |
1184 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_background_compactions) = 0; |
1185 | | // Disable post split compactions. |
1186 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_post_split_compaction) = true; |
1187 | |
|
1188 | 0 | SetNumTablets(1); |
1189 | 0 | CreateTable(); |
1190 | |
|
1191 | 0 | int key = 1; |
1192 | 0 | const auto num_tservers = cluster_->num_tablet_servers(); |
1193 | 0 | std::unordered_map<TabletId, std::unordered_set<TabletServerId>> tablet_id_to_known_peers; |
1194 | 0 | for (const auto& peer : ListTableActiveTabletPeers(cluster_.get(), table_->id())) { |
1195 | 0 | tablet_id_to_known_peers[peer->tablet_id()].insert(peer->permanent_uuid()); |
1196 | 0 | } |
1197 | |
|
1198 | 0 | size_t num_peers = num_tservers; |
1199 | |
|
1200 | 0 | auto test_phase = [&key, &num_peers, &tablet_id_to_known_peers, this]( |
1201 | 0 | size_t tablet_count_limit, uint64_t split_threshold_bytes) { |
1202 | 0 | while (num_peers < tablet_count_limit) { |
1203 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false; |
1204 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, key)); |
1205 | 0 | key += kNumRowsPerBatch; |
1206 | | |
1207 | | // Iterate through all peers (not just leaders) since we require post-split compactions to |
1208 | | // complete on all of a tablet's peers before splitting it. |
1209 | 0 | const auto peers = ListTableActiveTabletPeers(cluster_.get(), table_->id()); |
1210 | 0 | if (peers.size() > num_peers) { |
1211 | | // If a new tablet was formed, it means one of the tablets from the last iteration was |
1212 | | // split. In that case, verify that some peer has greater than the current split threshold |
1213 | | // bytes on disk. Note that it would not have compacted away the post split orphaned bytes |
1214 | | // since automatic compactions are off, so we expect this verification to pass with |
1215 | | // certainty. |
1216 | 0 | int new_peers = 0; |
1217 | 0 | for (const auto& peer : peers) { |
1218 | 0 | if (tablet_id_to_known_peers[peer->tablet_id()].count(peer->permanent_uuid()) == 0) { |
1219 | 0 | ++new_peers; |
1220 | | // Since we've disabled compactions, each post-split subtablet should be larger than the |
1221 | | // split size threshold. |
1222 | 0 | ASSERT_GE(peer->shared_tablet()->GetCurrentVersionSstFilesSize(), |
1223 | 0 | split_threshold_bytes); |
1224 | 0 | tablet_id_to_known_peers[peer->tablet_id()].insert(peer->permanent_uuid()); |
1225 | 0 | } |
1226 | 0 | } |
1227 | | // Should have two new peers per split tablet (on a tserver). |
1228 | 0 | const uint64_t num_new_splits = peers.size() - num_peers; |
1229 | 0 | ASSERT_EQ(new_peers, 2 * num_new_splits); |
1230 | |
|
1231 | 0 | num_peers = peers.size(); |
1232 | 0 | } |
1233 | |
|
1234 | 0 | for (const auto& peer : peers) { |
1235 | 0 | ASSERT_OK(peer->shared_tablet()->Flush(tablet::FlushMode::kSync)); |
1236 | | // Compact each tablet to remove the orphaned post-split data so that it can be split again. |
1237 | 0 | ASSERT_OK(peer->shared_tablet()->ForceFullRocksDBCompact()); |
1238 | 0 | } |
1239 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = true; |
1240 | 0 | SleepForBgTaskIters(2); |
1241 | 0 | } |
1242 | 0 | }; |
1243 | 0 | test_phase( |
1244 | 0 | FLAGS_tablet_split_low_phase_shard_count_per_node * num_tservers, |
1245 | 0 | FLAGS_tablet_split_low_phase_size_threshold_bytes); |
1246 | 0 | test_phase( |
1247 | 0 | FLAGS_tablet_split_high_phase_shard_count_per_node * num_tservers, |
1248 | 0 | FLAGS_tablet_split_high_phase_size_threshold_bytes); |
1249 | 0 | } |
1250 | | |
1251 | 0 | TEST_F(AutomaticTabletSplitITest, LimitNumberOfOutstandingTabletSplits) { |
1252 | 0 | constexpr int kNumRowsPerBatch = 1000; |
1253 | 0 | constexpr int kTabletSplitLimit = 3; |
1254 | |
|
1255 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
1256 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 0; |
1257 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_high_phase_size_threshold_bytes) = 0; |
1258 | | |
1259 | | // Limit the number of tablet splits. |
1260 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = kTabletSplitLimit; |
1261 | | // Start with candidate processing off. |
1262 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false; |
1263 | | |
1264 | | // Randomly fail a percentage of tablet splits to ensure that failed splits get removed. |
1265 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_fail_tablet_split_probability) = IsTsan() ? 0.1 : 0.2; |
1266 | | |
1267 | | // Create a table with kTabletSplitLimit tablets. |
1268 | 0 | int num_tablets = kTabletSplitLimit; |
1269 | 0 | SetNumTablets(num_tablets); |
1270 | 0 | CreateTable(); |
1271 | | // Add some data. |
1272 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, 1)); |
1273 | | |
1274 | | // Main test loop: |
1275 | | // Each loop we will split kTabletSplitLimit tablets with post split compactions disabled. |
1276 | | // We will then wait until we have that many tablets split, at which point we will reenable post |
1277 | | // split compactions. |
1278 | | // We will then wait until the post split compactions are done, then repeat. |
1279 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1280 | 0 | for (int split_round = 0; split_round < 3; ++split_round) { |
1281 | 0 | for (const auto& peer : peers) { |
1282 | | // Flush other replicas of this shard to ensure that even if the leader changed we will be in |
1283 | | // a state where yb-master should initiate a split. |
1284 | 0 | ASSERT_OK(FlushAllTabletReplicas(peer->tablet_id())); |
1285 | 0 | } |
1286 | | |
1287 | | // Keep tablets without compaction after split. |
1288 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = true; |
1289 | | // Enable splitting. |
1290 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = true; |
1291 | |
|
1292 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(num_tablets + kTabletSplitLimit)); |
1293 | | // Ensure that we don't split any more tablets. |
1294 | 0 | SleepForBgTaskIters(2); |
1295 | 0 | ASSERT_NOK(WaitForTabletSplitCompletion( |
1296 | 0 | num_tablets + kTabletSplitLimit + 1, // expected_non_split_tablets |
1297 | 0 | 0, // expected_split_tablets (default) |
1298 | 0 | 0, // num_replicas_online (default) |
1299 | 0 | client::kTableName, // table (default) |
1300 | 0 | false)); // core_dump_on_failure |
1301 | | |
1302 | | // Pause any more tablet splits. |
1303 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false; |
1304 | | // Reenable post split compaction, wait for this to complete so next tablets can be split. |
1305 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_before_post_split_compaction) = false; |
1306 | |
|
1307 | 0 | ASSERT_OK(WaitForTestTablePostSplitTabletsFullyCompacted(15s * kTimeMultiplier)); |
1308 | |
|
1309 | 0 | peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1310 | 0 | num_tablets = narrow_cast<int32_t>(peers.size()); |
1311 | | |
1312 | | // There should be kTabletSplitLimit initial tablets + kTabletSplitLimit new tablets per loop. |
1313 | 0 | EXPECT_EQ(num_tablets, (split_round + 2) * kTabletSplitLimit); |
1314 | 0 | } |
1315 | | |
1316 | | // TODO (jhe) For now we need to manually shut down the cluster, otherwise the cluster verifier can |
1317 | | // get stuck waiting for tablets that got registered but whose tablet split got cancelled by |
1318 | | // FLAGS_TEST_fail_tablet_split_probability. |
1319 | | // We should either have a way to wait for these tablets to get split, or have a way to delete |
1320 | | // these tablets in case a tablet split fails. |
1321 | 0 | cluster_->Shutdown(); |
1322 | 0 | } |
1323 | | |
1324 | 0 | TEST_F(AutomaticTabletSplitITest, DroppedTablesExcludedFromOutstandingSplitLimit) { |
1325 | 0 | constexpr int kNumRowsPerBatch = 1000; |
1326 | 0 | constexpr int kTabletSplitLimit = 1; |
1327 | 0 | constexpr int kNumInitialTablets = 1; |
1328 | | |
1329 | | // Limit the number of tablet splits. |
1330 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = kTabletSplitLimit; |
1331 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
1332 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 0; |
1333 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_disable_compactions) = true; |
1334 | |
|
1335 | 0 | SetNumTablets(kNumInitialTablets); |
1336 | 0 | CreateTable(); |
1337 | 0 | auto table1_peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1338 | 0 | ASSERT_EQ(table1_peers.size(), 1); |
1339 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, 1)); |
1340 | | // Flush to ensure an SST file is generated so splitting can occur. |
1341 | 0 | ASSERT_OK(FlushAllTabletReplicas(table1_peers[0]->tablet_id())); |
1342 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(kNumInitialTablets + 1)); |
1343 | |
|
1344 | 0 | client::TableHandle table2; |
1345 | 0 | auto table2_name = client::YBTableName(YQL_DATABASE_CQL, "my_keyspace", "ql_client_test_table_2"); |
1346 | 0 | client::kv_table_test::CreateTable( |
1347 | 0 | client::Transactional(true), kNumInitialTablets, client_.get(), &table2, table2_name); |
1348 | 0 | auto table2_peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table2->id()); |
1349 | 0 | ASSERT_EQ(table2_peers.size(), 1); |
1350 | 0 | ASSERT_OK(WriteRows(&table2, kNumRowsPerBatch, 1)); |
1351 | | // Flush to ensure an SST file is generated so splitting can occur. |
1352 | 0 | ASSERT_OK(FlushAllTabletReplicas(table2_peers[0]->tablet_id())); |
1353 | | |
1354 | | // The tablet should not split while the split for the first table is outstanding. |
1355 | 0 | SleepForBgTaskIters(2); |
1356 | 0 | ASSERT_NOK(WaitForTabletSplitCompletion( |
1357 | 0 | kNumInitialTablets + 1, // expected_non_split_tablets |
1358 | 0 | 0, // expected_split_tablets (default) |
1359 | 0 | 0, // num_replicas_online (default) |
1360 | 0 | table2_name, // table |
1361 | 0 | false)); // core_dump_on_failure |
1362 | | |
1363 | | // After deleting the first table, its split should no longer be counted for an ongoing split, so |
1364 | | // the second table's tablet should split. |
1365 | 0 | ASSERT_OK(client_->DeleteTable(client::kTableName)); |
1366 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
1367 | 0 | kNumInitialTablets + 1, // expected_non_split_tablets |
1368 | 0 | 0, // expected_split_tablets (default) |
1369 | 0 | 0, // num_replicas_online (default) |
1370 | 0 | table2_name)); // table |
1371 | 0 | } |
1372 | | |
1373 | 0 | TEST_F(AutomaticTabletSplitITest, IncludeTasksInOutstandingSplits) { |
1374 | 0 | constexpr int kNumRowsPerBatch = 1000; |
1375 | 0 | constexpr int kInitialNumTablets = 2; |
1376 | | |
1377 | | // Only allow one tablet split to start. |
1378 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 1; |
1379 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 100; |
1380 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 0; |
1381 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = true; |
1382 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_reject_delete_not_serving_tablet_rpc) = true; |
1383 | | |
1384 | | // Start with two tablets. Only one should be chosen for splitting, and it should stall (but be |
1385 | | // counted as outstanding) until the pause is removed. |
1386 | 0 | SetNumTablets(kInitialNumTablets); |
1387 | 0 | CreateTable(); |
1388 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1389 | 0 | ASSERT_EQ(peers.size(), kInitialNumTablets); |
1390 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, 1)); |
1391 | | // Flush to ensure an SST file is generated so splitting can occur. |
1392 | 0 | for (const auto& peer : peers) { |
1393 | 0 | ASSERT_OK(FlushAllTabletReplicas(peer->tablet_id())); |
1394 | 0 | } |
1395 | | // Assert that the other tablet does not get split. |
1396 | 0 | SleepForBgTaskIters(2); |
1397 | 0 | ASSERT_NOK(WaitForTabletSplitCompletion( |
1398 | 0 | kInitialNumTablets + 1, // expected_non_split_tablets |
1399 | 0 | 1, // expected_split_tablets |
1400 | 0 | 0, // num_replicas_online (default) |
1401 | 0 | client::kTableName, // table (default) |
1402 | 0 | false)); // core_dump_on_failure |
1403 | | |
1404 | | // Allow no new splits. The stalled split (and no other splits) should finish after the pause is |
1405 | | // removed. |
1406 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_outstanding_tablet_split_limit) = 0; |
1407 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = false; |
1408 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(kInitialNumTablets + 1, /* expected_non_split_tablets */ |
1409 | 0 | 1 /* expected_split_tablets */)); |
1410 | 0 | SleepForBgTaskIters(2); |
1411 | 0 | ASSERT_NOK(WaitForTabletSplitCompletion( |
1412 | 0 | kInitialNumTablets + 2, // expected_non_split_tablets |
1413 | 0 | 1, // expected_split_tablets |
1414 | 0 | 0, // num_replicas_online (default) |
1415 | 0 | client::kTableName, // table (default) |
1416 | 0 | false)); // core_dump_on_failure |
1417 | 0 | } |
1418 | | |
1419 | 0 | TEST_F(AutomaticTabletSplitITest, FailedSplitIsRestarted) { |
1420 | 0 | constexpr int kNumRowsPerBatch = 1000; |
1421 | |
|
1422 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1; |
1423 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 0; |
1424 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_disable_compactions) = true; |
1425 | | // Fail the split on the tserver. |
1426 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_fail_tablet_split_probability) = 1; |
1427 | |
|
1428 | 0 | CreateSingleTablet(); |
1429 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1430 | 0 | ASSERT_EQ(peers.size(), 1); |
1431 | 0 | ASSERT_OK(WriteRows(kNumRowsPerBatch, 1)); |
1432 | | // Flush to ensure an SST file is generated so splitting can occur. |
1433 | 0 | ASSERT_OK(FlushAllTabletReplicas(peers[0]->tablet_id())); |
1434 | | |
1435 | | // The split should fail because of the test flag. |
1436 | 0 | SleepForBgTaskIters(2); |
1437 | 0 | ASSERT_NOK(WaitForTabletSplitCompletion( |
1438 | 0 | 2, // expected_non_split_tablets |
1439 | 0 | 0, // expected_split_tablets (default) |
1440 | 0 | 0, // num_replicas_online (default) |
1441 | 0 | client::kTableName, // table (default) |
1442 | 0 | false)); // core_dump_on_failure |
1443 | |
|
1444 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_fail_tablet_split_probability) = 0; |
1445 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(2)); |
1446 | 0 | } |
1447 | | |
1448 | | class TabletSplitSingleServerITest : public TabletSplitITest { |
1449 | | protected: |
1450 | 5 | int64_t GetRF() override { return 1; } |
1451 | | |
1452 | 0 | Result<tablet::TabletPeerPtr> GetSingleTabletLeaderPeer() { |
1453 | 0 | auto peers = ListTableActiveTabletLeadersPeers(cluster_.get(), table_->id()); |
1454 | 0 | SCHECK_EQ(peers.size(), 1U, IllegalState, "Expected only a single tablet leader."); |
1455 | 0 | return peers.at(0); |
1456 | 0 | } |
1457 | | }; |
1458 | | |
1459 | 0 | TEST_F(TabletSplitSingleServerITest, TabletServerGetSplitKey) { |
1460 | 0 | constexpr auto kNumRows = kDefaultNumRows; |
1461 | | // Setup table with rows. |
1462 | 0 | CreateSingleTablet(); |
1463 | 0 | ASSERT_OK(WriteRowsAndGetMiddleHashCode(kNumRows)); |
1464 | 0 | const auto source_tablet_id = |
1465 | 0 | ASSERT_RESULT(GetSingleTestTabletInfo(ASSERT_RESULT(catalog_manager())))->id(); |
1466 | | |
1467 | | // Flush tablet and directly compute expected middle key. |
1468 | 0 | auto tablet_peer = ASSERT_RESULT(GetSingleTabletLeaderPeer()); |
1469 | 0 | ASSERT_OK(tablet_peer->shared_tablet()->Flush(tablet::FlushMode::kSync)); |
1470 | 0 | auto middle_key = ASSERT_RESULT(tablet_peer->shared_tablet()->GetEncodedMiddleSplitKey()); |
1471 | 0 | auto expected_middle_key_hash = CHECK_RESULT(docdb::DocKey::DecodeHash(middle_key)); |
1472 | | |
1473 | | // Send RPC. |
1474 | 0 | auto resp = ASSERT_RESULT(GetSplitKey(source_tablet_id)); |
1475 | | |
1476 | | // Validate response. |
1477 | 0 | CHECK(!resp.has_error()) << resp.error().DebugString(); |
1478 | 0 | auto decoded_split_key_hash = CHECK_RESULT(docdb::DocKey::DecodeHash(resp.split_encoded_key())); |
1479 | 0 | CHECK_EQ(decoded_split_key_hash, expected_middle_key_hash); |
1480 | 0 | auto decoded_partition_key_hash = PartitionSchema::DecodeMultiColumnHashValue( |
1481 | 0 | resp.split_partition_key()); |
1482 | 0 | CHECK_EQ(decoded_partition_key_hash, expected_middle_key_hash); |
1483 | 0 | } |
1484 | | |
1485 | 0 | TEST_F(TabletSplitSingleServerITest, TabletServerOrphanedPostSplitData) { |
1486 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_disable_compactions) = true; |
1487 | 0 | constexpr auto kNumRows = 2000; |
1488 | |
|
1489 | 0 | auto source_tablet_id = CreateSingleTabletAndSplit(kNumRows); |
1490 | | |
1491 | | // Try to call GetSplitKey RPC on each child tablet that resulted from the split above |
1492 | 0 | const auto& peers = ListTableActiveTabletPeers(cluster_.get(), table_->id()); |
1493 | 0 | ASSERT_EQ(peers.size(), 2); |
1494 | |
|
1495 | 0 | for (const auto& peer : peers) { |
1496 | | // Send RPC to child tablet. |
1497 | 0 | auto resp = ASSERT_RESULT(GetSplitKey(peer->tablet_id())); |
1498 | | |
1499 | | // Validate response |
1500 | 0 | EXPECT_TRUE(resp.has_error()); |
1501 | 0 | EXPECT_TRUE(resp.error().has_status()); |
1502 | 0 | EXPECT_TRUE(resp.error().status().has_message()); |
1503 | 0 | EXPECT_EQ(resp.error().status().code(), |
1504 | 0 | yb::AppStatusPB::ErrorCode::AppStatusPB_ErrorCode_ILLEGAL_STATE); |
1505 | 0 | EXPECT_EQ(resp.error().status().message(), "Tablet has orphaned post-split data"); |
1506 | 0 | } |
1507 | 0 | } |
1508 | | |
1509 | 0 | TEST_F(TabletSplitSingleServerITest, TabletServerSplitAlreadySplitTablet) { |
1510 | 0 | constexpr auto kNumRows = 2000; |
1511 | |
|
1512 | 0 | CreateSingleTablet(); |
1513 | 0 | auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kNumRows)); |
1514 | 0 | auto tablet_peer = ASSERT_RESULT(GetSingleTabletLeaderPeer()); |
1515 | 0 | const auto tserver_uuid = tablet_peer->permanent_uuid(); |
1516 | |
|
1517 | 0 | SetAtomicFlag(true, &FLAGS_TEST_skip_deleting_split_tablets); |
1518 | 0 | const auto source_tablet_id = ASSERT_RESULT(SplitSingleTablet(split_hash_code)); |
1519 | 0 | ASSERT_OK(WaitForTabletSplitCompletion( |
1520 | 0 | /* expected_non_split_tablets =*/ 2, /* expected_split_tablets = */ 1)); |
1521 | |
|
1522 | 0 | auto send_split_request = [this, &tserver_uuid, &source_tablet_id]() |
1523 | 0 | -> Result<tserver::SplitTabletResponsePB> { |
1524 | 0 | auto tserver = cluster_->mini_tablet_server(0); |
1525 | 0 | auto ts_admin_service_proxy = std::make_unique<tserver::TabletServerAdminServiceProxy>( |
1526 | 0 | proxy_cache_.get(), HostPort::FromBoundEndpoint(tserver->bound_rpc_addr())); |
1527 | 0 | tablet::SplitTabletRequestPB req; |
1528 | 0 | req.set_dest_uuid(tserver_uuid); |
1529 | 0 | req.set_tablet_id(source_tablet_id); |
1530 | 0 | req.set_new_tablet1_id(Format("$0$1", source_tablet_id, "1")); |
1531 | 0 | req.set_new_tablet2_id(Format("$0$1", source_tablet_id, "2")); |
1532 | 0 | req.set_split_partition_key("abc"); |
1533 | 0 | req.set_split_encoded_key("def"); |
1534 | 0 | rpc::RpcController controller; |
1535 | 0 | controller.set_timeout(kRpcTimeout); |
1536 | 0 | tserver::SplitTabletResponsePB resp; |
1537 | 0 | RETURN_NOT_OK(ts_admin_service_proxy->SplitTablet(req, &resp, &controller)); |
1538 | 0 | return resp; |
1539 | 0 | }; |
1540 | | |
1541 | | // If the parent tablet is still around, this should trigger an AlreadyPresent error |
1542 | 0 | auto resp = ASSERT_RESULT(send_split_request()); |
1543 | 0 | EXPECT_TRUE(resp.has_error()); |
1544 | 0 | EXPECT_TRUE(StatusFromPB(resp.error().status()).IsAlreadyPresent()) << resp.error().DebugString(); |
1545 | |
|
1546 | 0 | SetAtomicFlag(false, &FLAGS_TEST_skip_deleting_split_tablets); |
1547 | 0 | ASSERT_OK(WaitForTabletSplitCompletion(/* expected_non_split_tablets =*/ 2)); |
1548 | | |
1549 | | // If the parent tablet has been cleaned up, this should trigger a Not Found error. |
1550 | 0 | resp = ASSERT_RESULT(send_split_request()); |
1551 | 0 | EXPECT_TRUE(resp.has_error()); |
1552 | 0 | EXPECT_TRUE( |
1553 | 0 | StatusFromPB(resp.error().status()).IsNotFound() || |
1554 | 0 | resp.error().code() == tserver::TabletServerErrorPB::TABLET_NOT_FOUND) |
1555 | 0 | << resp.error().DebugString(); |
1556 | 0 | } |
1557 | | |
1558 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, Simple) { |
1559 | 1 | CreateSingleTablet(); |
1560 | 1 | CHECK_OK(WriteRowsAndFlush()); |
1561 | 1 | auto tablet_id = CHECK_RESULT(GetOnlyTestTabletId()); |
1562 | 1 | CHECK_OK(SplitTablet(tablet_id)); |
1563 | 1 | ASSERT_OK(WaitForTablets(3)); |
1564 | 1 | } |
1565 | | |
1566 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, CrashMasterDuringSplit) { |
1567 | 1 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_heartbeat_metrics_interval_ms) = |
1568 | 1 | FLAGS_heartbeat_interval_ms + 1000; |
1569 | | |
1570 | 1 | ASSERT_OK(SplitTabletCrashMaster(false, nullptr)); |
1571 | 1 | } |
1572 | | |
1573 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, CrashMasterCheckConsistentPartitionKeys) { |
1574 | | // Tests that when master crashes during a split and a new split key is used, |
1575 | | // we will revert to an older boundary used by the initial split. |
1576 | | // Used to validate the fix for: https://github.com/yugabyte/yugabyte-db/issues/8148 |
1577 | 1 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_heartbeat_metrics_interval_ms) = |
1578 | 1 | FLAGS_heartbeat_interval_ms + 1000; |
1579 | | |
1580 | 1 | string split_partition_key; |
1581 | 1 | ASSERT_OK(SplitTabletCrashMaster(true, &split_partition_key)); |
1582 | | |
1583 | 1 | auto tablets = CHECK_RESULT(ListTablets()); |
1584 | 1 | ASSERT_EQ(tablets.size(), 2); |
1585 | 1 | auto part = tablets.at(0).tablet_status().partition(); |
1586 | 1 | auto part2 = tablets.at(1).tablet_status().partition(); |
1587 | | |
1588 | | // Check that the two partitions meet at split_partition_key, in whichever order they were listed. |
1589 | 1 | if (part.partition_key_end() == part2.partition_key_start() && part.partition_key_end() != "") { |
1590 | 0 | ASSERT_EQ(part.partition_key_end(), split_partition_key); |
1591 | 1 | } else { |
1592 | 1 | ASSERT_EQ(part.partition_key_start(), split_partition_key); |
1593 | 1 | ASSERT_EQ(part2.partition_key_end(), split_partition_key); |
1594 | 1 | } |
1595 | 1 | } |
1596 | | |
1597 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, FaultedSplitNodeRejectsRemoteBootstrap) { |
1598 | 1 | constexpr int kTabletSplitInjectDelayMs = 20000 * kTimeMultiplier; |
1599 | 1 | CreateSingleTablet(); |
1600 | 1 | ASSERT_OK(WriteRowsAndFlush()); |
1601 | 1 | const auto tablet_id = CHECK_RESULT(GetOnlyTestTabletId()); |
1602 | | |
1603 | 1 | const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id)); |
1604 | 1 | const auto healthy_follower_idx = (leader_idx + 1) % 3; |
1605 | 1 | const auto faulted_follower_idx = (leader_idx + 2) % 3; |
1606 | | |
1607 | 1 | auto faulted_follower = cluster_->tablet_server(faulted_follower_idx); |
1608 | 1 | ASSERT_OK(cluster_->SetFlag( |
1609 | 1 | faulted_follower, "TEST_crash_before_apply_tablet_split_op", "true")); |
1610 | | |
1611 | 1 | ASSERT_OK(SplitTablet(tablet_id)); |
1612 | 1 | ASSERT_OK(cluster_->WaitForTSToCrash(faulted_follower)); |
1613 | | |
1614 | 1 | ASSERT_OK(faulted_follower->Restart(ExternalMiniClusterOptions::kDefaultStartCqlProxy, { |
1615 | 1 | std::make_pair( |
1616 | 1 | "TEST_apply_tablet_split_inject_delay_ms", Format("$0", kTabletSplitInjectDelayMs)) |
1617 | 1 | })); |
1618 | | |
1619 | 1 | consensus::StartRemoteBootstrapRequestPB req; |
1620 | 1 | req.set_split_parent_tablet_id(tablet_id); |
1621 | 1 | req.set_dest_uuid(faulted_follower->uuid()); |
1622 | | // We put some bogus values for these next two required fields. |
1623 | 1 | req.set_tablet_id("::std::string &&value"); |
1624 | 1 | req.set_bootstrap_peer_uuid("abcdefg"); |
1625 | 1 | consensus::StartRemoteBootstrapResponsePB resp; |
1626 | 1 | rpc::RpcController rpc; |
1627 | 1 | rpc.set_timeout(kRpcTimeout); |
1628 | 1 | auto s = cluster_->GetConsensusProxy(faulted_follower).StartRemoteBootstrap(req, &resp, &rpc); |
1629 | 1 | EXPECT_OK(s); |
1630 | 1 | EXPECT_TRUE(resp.has_error()); |
1631 | 1 | EXPECT_EQ(resp.error().code(), tserver::TabletServerErrorPB::TABLET_SPLIT_PARENT_STILL_LIVE); |
1632 | | |
1633 | 1 | SleepFor(1ms * kTabletSplitInjectDelayMs); |
1634 | 1 | EXPECT_OK(WaitForTablets(2)); |
1635 | 1 | EXPECT_OK(WaitForTablets(2, faulted_follower_idx)); |
1636 | | |
1637 | | // By shutting down the healthy follower and writing rows to the table, we ensure the faulted |
1638 | | // follower is eventually able to rejoin the raft group. |
1639 | 1 | auto healthy_follower = cluster_->tablet_server(healthy_follower_idx); |
1640 | 1 | healthy_follower->Shutdown(); |
1641 | 1 | EXPECT_OK(WaitFor([&]() -> Result<bool> { |
1642 | 1 | return WriteRows(10).ok(); |
1643 | 1 | }, 20s * kTimeMultiplier, "Write rows after requiring faulted follower.")); |
1644 | | |
1645 | 1 | ASSERT_OK(healthy_follower->Restart()); |
1646 | 1 | } |
1647 | | |
1648 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, CrashesAfterChildLogCopy) { |
1649 | 1 | ASSERT_OK(cluster_->SetFlagOnMasters("unresponsive_ts_rpc_retry_limit", "0")); |
1650 | | |
1651 | 1 | CreateSingleTablet(); |
1652 | 1 | CHECK_OK(WriteRowsAndFlush()); |
1653 | 1 | const auto tablet_id = CHECK_RESULT(GetOnlyTestTabletId()); |
1654 | | |
1655 | | // We will fault one of the non-leader servers after it performs a WAL Log copy from parent to |
1656 | | // the first child, but before it can mark the child as TABLET_DATA_READY. |
1657 | 1 | const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id)); |
1658 | 1 | const auto faulted_follower_idx = (leader_idx + 2) % 3; |
1659 | 1 | const auto non_faulted_follower_idx = (leader_idx + 1) % 3; |
1660 | | |
1661 | 1 | auto faulted_follower = cluster_->tablet_server(faulted_follower_idx); |
1662 | 1 | CHECK_OK(cluster_->SetFlag( |
1663 | 1 | faulted_follower, "TEST_fault_crash_in_split_after_log_copied", "1.0")); |
1664 | | |
1665 | 1 | CHECK_OK(SplitTablet(tablet_id)); |
1666 | 1 | CHECK_OK(cluster_->WaitForTSToCrash(faulted_follower)); |
1667 | | |
1668 | 1 | CHECK_OK(faulted_follower->Restart()); |
1669 | | |
1670 | 1 | ASSERT_OK(cluster_->WaitForTabletsRunning(faulted_follower, 20s * kTimeMultiplier)); |
1671 | 1 | ASSERT_OK(WaitForTablets(3)); |
1672 | | |
1673 | 1 | ASSERT_OK(WaitFor([&]() -> Result<bool> { |
1674 | 1 | return WriteRows().ok(); |
1675 | 1 | }, 20s * kTimeMultiplier, "Write rows after faulted follower resurrection.")); |
1676 | | |
1677 | 1 | auto non_faulted_follower = cluster_->tablet_server(non_faulted_follower_idx); |
1678 | 1 | non_faulted_follower->Shutdown(); |
1679 | 1 | CHECK_OK(cluster_->WaitForTSToCrash(non_faulted_follower)); |
1680 | | |
1681 | 1 | ASSERT_OK(WaitFor([&]() -> Result<bool> { |
1682 | 1 | return WriteRows().ok(); |
1683 | 1 | }, 20s * kTimeMultiplier, "Write rows after requiring bootstraped node consensus.")); |
1684 | | |
1685 | 1 | CHECK_OK(non_faulted_follower->Restart()); |
1686 | 1 | } |
1687 | | |
1688 | | class TabletSplitRemoteBootstrapEnabledTest : public TabletSplitExternalMiniClusterITest { |
1689 | | protected: |
1690 | 1 | void SetFlags() override { |
1691 | 1 | TabletSplitExternalMiniClusterITest::SetFlags(); |
1692 | 1 | mini_cluster_opt_.extra_tserver_flags.push_back( |
1693 | 1 | "--TEST_disable_post_split_tablet_rbs_check=true"); |
1694 | 1 | } |
1695 | | }; |
1696 | | |
1697 | 1 | TEST_F(TabletSplitRemoteBootstrapEnabledTest, TestSplitAfterFailedRbsCreatesDirectories) { |
1698 | 1 | const auto kApplyTabletSplitDelay = 15s * kTimeMultiplier; |
1699 | | |
1700 | 1 | const auto get_tablet_meta_dirs = |
1701 | 100 | [this](ExternalTabletServer* node) -> Result<std::vector<string>> { |
1702 | 100 | auto tablet_meta_dirs = VERIFY_RESULT(env_->GetChildren( |
1703 | 100 | JoinPathSegments(node->GetRootDir(), "yb-data", "tserver", "tablet-meta"))); |
1704 | 100 | std::sort(tablet_meta_dirs.begin(), tablet_meta_dirs.end()); |
1705 | 100 | return tablet_meta_dirs; |
1706 | 100 | }; |
1707 | | |
1708 | 1 | const auto wait_for_same_tablet_metas = |
1709 | 1 | [&get_tablet_meta_dirs] |
1710 | 2 | (ExternalTabletServer* node_1, ExternalTabletServer* node_2) -> Status { |
1711 | 50 | return WaitFor([node_1, node_2, &get_tablet_meta_dirs]() -> Result<bool> { |
1712 | 50 | auto node_1_metas = VERIFY_RESULT(get_tablet_meta_dirs(node_1)); |
1713 | 50 | auto node_2_metas = VERIFY_RESULT(get_tablet_meta_dirs(node_2)); |
1714 | 50 | if (node_1_metas.size() != node_2_metas.size()) { |
1715 | 48 | return false; |
1716 | 48 | } |
1717 | 18 | for (size_t i = 0; i < node_2_metas.size(); ++i) { |
1718 | 16 | if (node_1_metas.at(i) != node_2_metas.at(i)) { |
1719 | 0 | return false; |
1720 | 0 | } |
1721 | 16 | } |
1722 | 2 | return true; |
1723 | 2 | }, 5s * kTimeMultiplier, "Waiting for nodes to have same set of tablet metas."); |
1724 | 2 | }; |
1725 | | |
1726 | 1 | CreateSingleTablet(); |
1727 | 1 | ASSERT_OK(WriteRowsAndFlush()); |
1728 | 1 | const auto tablet_id = CHECK_RESULT(GetOnlyTestTabletId()); |
1729 | | |
1730 | 1 | const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id)); |
1731 | 1 | const auto leader = cluster_->tablet_server(leader_idx); |
1732 | 1 | const auto healthy_follower_idx = (leader_idx + 1) % 3; |
1733 | 1 | const auto healthy_follower = cluster_->tablet_server(healthy_follower_idx); |
1734 | 1 | const auto faulted_follower_idx = (leader_idx + 2) % 3; |
1735 | 1 | const auto faulted_follower = cluster_->tablet_server(faulted_follower_idx); |
1736 | | |
1737 | | // Make one node fail on tablet split, and ensure the leader does not remote bootstrap to it at |
1738 | | // first. |
1739 | 1 | ASSERT_OK(cluster_->SetFlag( |
1740 | 1 | faulted_follower, "TEST_crash_before_apply_tablet_split_op", "true")); |
1741 | 1 | ASSERT_OK(cluster_->SetFlag(leader, "TEST_enable_remote_bootstrap", "false")); |
1742 | 1 | ASSERT_OK(SplitTablet(tablet_id)); |
1743 | 1 | ASSERT_OK(cluster_->WaitForTSToCrash(faulted_follower)); |
1744 | | |
1745 | | // Once split is applied on two nodes, re-enable remote bootstrap before restarting the faulted |
1746 | | // node. Ensure that remote bootstrap requests can be retried until the faulted node is up. |
1747 | 1 | ASSERT_OK(WaitForTablets(3, leader_idx)); |
1748 | 1 | ASSERT_OK(WaitForTablets(3, healthy_follower_idx)); |
1749 | 1 | ASSERT_OK(wait_for_same_tablet_metas(leader, healthy_follower)); |
1750 | 1 | ASSERT_OK(cluster_->SetFlag(leader, "unresponsive_ts_rpc_retry_limit", "100")); |
1751 | 1 | ASSERT_OK(cluster_->SetFlag(leader, "TEST_enable_remote_bootstrap", "true")); |
1752 | | |
1753 | | // Restart the faulted node. Ensure it waits a long time in ApplyTabletSplit to allow a remote |
1754 | | // bootstrap request to come in and create a directory for the subtablets before returning error. |
1755 | 1 | ASSERT_OK(faulted_follower->Restart(ExternalMiniClusterOptions::kDefaultStartCqlProxy, { |
1756 | 1 | std::make_pair( |
1757 | 1 | "TEST_apply_tablet_split_inject_delay_ms", |
1758 | 1 | Format("$0", MonoDelta(kApplyTabletSplitDelay).ToMilliseconds())), |
1759 | 1 | std::make_pair("TEST_simulate_already_present_in_remote_bootstrap", "true"), |
1760 | 1 | std::make_pair("TEST_crash_before_apply_tablet_split_op", "false"), |
1761 | 1 | })); |
1762 | | |
1763 | | // Once the faulted node has the same tablet metas written to disk as the leader, disable remote |
1764 | | // bootstrap to avoid registering transition status for the subtablets. |
1765 | 1 | ASSERT_OK(wait_for_same_tablet_metas(leader, faulted_follower)); |
1766 | 1 | ASSERT_OK(cluster_->SetFlagOnTServers("TEST_enable_remote_bootstrap", "false")); |
1767 | | |
1768 | | // Sleep some time to allow the ApplyTabletSplit pause to run out, and then ensure we have healthy |
1769 | | // subtablets at the formerly faulted follower node. |
1770 | 1 | std::this_thread::sleep_for(kApplyTabletSplitDelay); |
1771 | 1 | EXPECT_OK(WaitFor([&]() -> Result<bool> { |
1772 | 1 | return WriteRows().ok(); |
1773 | 1 | }, 10s * kTimeMultiplier, "Write rows after faulted follower resurrection.")); |
1774 | 1 | cluster_->Shutdown(); |
1775 | 1 | } |
1776 | | |
1777 | 1 | TEST_F(TabletSplitExternalMiniClusterITest, RemoteBootstrapsFromNodeWithUncommittedSplitOp) { |
1778 | | // If a new tablet is created and split with one node completely uninvolved, then when that node |
1779 | | // rejoins it will have to do a remote bootstrap. |
1780 | | |
1781 | 1 | const auto server_to_bootstrap_idx = 0; |
1782 | 1 | std::vector<size_t> other_servers; |
1783 | 4 | for (size_t i = 0; i < cluster_->num_tablet_servers(); ++i) { |
1784 | 3 | if (i != server_to_bootstrap_idx) { |
1785 | 2 | other_servers.push_back(i); |
1786 | 2 | } |
1787 | 3 | } |
1788 | | |
1789 | 1 | ASSERT_OK(cluster_->SetFlagOnMasters("unresponsive_ts_rpc_retry_limit", "0")); |
1790 | | |
1791 | 1 | auto server_to_bootstrap = cluster_->tablet_server(server_to_bootstrap_idx); |
1792 | 1 | server_to_bootstrap->Shutdown(); |
1793 | 1 | CHECK_OK(cluster_->WaitForTSToCrash(server_to_bootstrap)); |
1794 | | |
1795 | 1 | CreateSingleTablet(); |
1796 | 1 | const auto other_server_idx = *other_servers.begin(); |
1797 | 1 | const auto tablet_id = CHECK_RESULT(GetOnlyTestTabletId(other_server_idx)); |
1798 | | |
1799 | 1 | CHECK_OK(WriteRows()); |
1800 | 4 | for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { |
1801 | 3 | if (i != server_to_bootstrap_idx) { |
1802 | 2 | ASSERT_OK(FlushTabletsOnSingleTServer(i, {tablet_id}, false)); |
1803 | 2 | } |
1804 | 3 | } |
1805 | | |
1806 | 1 | const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id)); |
1807 | 1 | const auto server_to_kill_idx = 3 - leader_idx - server_to_bootstrap_idx; |
1808 | 1 | auto server_to_kill = cluster_->tablet_server(server_to_kill_idx); |
1809 | | |
1810 | 1 | auto leader = cluster_->tablet_server(leader_idx); |
1811 | 1 | CHECK_OK(cluster_->SetFlag( |
1812 | 1 | server_to_kill, "TEST_fault_crash_in_split_before_log_flushed", "1.0")); |
1813 | 1 | CHECK_OK(cluster_->SetFlag(leader, "TEST_fault_crash_in_split_before_log_flushed", "1.0")); |
1814 | 1 | CHECK_OK(SplitTablet(tablet_id)); |
1815 | | |
1816 | | // The leader is guaranteed to attempt to apply the split operation and crash. |
1817 | 1 | CHECK_OK(cluster_->WaitForTSToCrash(leader)); |
1818 | | // The other follower may or may not attempt to apply the split operation. We shut it down here so |
1819 | | // that it cannot be used for remote bootstrap. |
1820 | 1 | server_to_kill->Shutdown(); |
1821 | | |
1822 | 1 | CHECK_OK(leader->Restart()); |
1823 | 1 | CHECK_OK(server_to_bootstrap->Restart()); |
1824 | | |
1825 | 1 | ASSERT_OK(cluster_->WaitForTabletsRunning(leader, 20s * kTimeMultiplier)); |
1826 | 1 | ASSERT_OK(cluster_->WaitForTabletsRunning(server_to_bootstrap, 20s * kTimeMultiplier)); |
1827 | 1 | CHECK_OK(server_to_kill->Restart()); |
1828 | 1 | ASSERT_OK(WaitForTabletsExcept(2, server_to_bootstrap_idx, tablet_id)); |
1829 | 1 | ASSERT_OK(WaitForTabletsExcept(2, leader_idx, tablet_id)); |
1830 | 1 | ASSERT_OK(WaitForTabletsExcept(2, server_to_kill_idx, tablet_id)); |
1831 | | |
1832 | 1 | ASSERT_OK(WaitFor([&]() -> Result<bool> { |
1833 | 1 | return WriteRows().ok(); |
1834 | 1 | }, 20s * kTimeMultiplier, "Write rows after split.")); |
1835 | | |
1836 | 1 | server_to_kill->Shutdown(); |
1837 | 1 | CHECK_OK(cluster_->WaitForTSToCrash(server_to_kill)); |
1838 | | |
1839 | 1 | ASSERT_OK(WaitFor([&]() -> Result<bool> { |
1840 | 1 | return WriteRows().ok(); |
1841 | 1 | }, 20s * kTimeMultiplier, "Write rows after requiring bootstraped node consensus.")); |
1842 | | |
1843 | 1 | CHECK_OK(server_to_kill->Restart()); |
1844 | 1 | } |
1845 | | |
1846 | | class TabletSplitReplaceNodeITest : public TabletSplitExternalMiniClusterITest { |
1847 | | protected: |
1848 | 1 | void SetFlags() override { |
1849 | 1 | TabletSplitExternalMiniClusterITest::SetFlags(); |
1850 | | |
1851 | 1 | for (const auto& tserver_flag : { |
1852 | | // We want to test behavior of the source tablet, so setting up to skip deleting it. |
1853 | 1 | "--TEST_skip_deleting_split_tablets=true", |
1854 | | // Reduce follower_unavailable_considered_failed_sec, so offline tserver is evicted |
1855 | | // from Raft group faster. |
1856 | 1 | "--follower_unavailable_considered_failed_sec=5", |
1857 | 2 | }) { |
1858 | 2 | mini_cluster_opt_.extra_tserver_flags.push_back(tserver_flag); |
1859 | 2 | } |
1860 | | |
1861 | 1 | for (const auto& master_flag : { |
1862 | | // Should be less than follower_unavailable_considered_failed_sec, so load balancer |
1863 | | // doesn't go into infinite loop trying to add failed follower back. |
1864 | 1 | "--tserver_unresponsive_timeout_ms=3000", |
1865 | | // To speed up load balancing: |
1866 | | // - Allow more concurrent adds/removes, so we deal with transaction status tablets |
1867 | | // faster. |
1868 | 1 | "--load_balancer_max_concurrent_adds=10", "--load_balancer_max_concurrent_removals=10", |
1869 | | // - Allow more over replicated tablets, so temporary child tablets over replication |
1870 | | // doesn't block parent tablet move. |
1871 | 1 | "--load_balancer_max_over_replicated_tablets=5", |
1872 | | // TODO: should be default behaviour after |
1873 | | // https://github.com/yugabyte/yugabyte-db/issues/10301 is fixed. |
1874 | 1 | "--TEST_load_balancer_skip_inactive_tablets=false", |
1875 | 5 | }) { |
1876 | 5 | mini_cluster_opt_.extra_master_flags.push_back(master_flag); |
1877 | 5 | } |
1878 | 1 | } |
1879 | | }; |
1880 | | |
1881 | | TEST_F_EX( |
1882 | 1 | TabletSplitExternalMiniClusterITest, ReplaceNodeForParentTablet, TabletSplitReplaceNodeITest) { |
1883 | 1 | constexpr auto kNumRows = kDefaultNumRows; |
1884 | | |
1885 | 1 | CreateSingleTablet(); |
1886 | 1 | ASSERT_OK(WriteRowsAndFlush(kNumRows)); |
1887 | 1 | const auto source_tablet_id = ASSERT_RESULT(GetOnlyTestTabletId()); |
1888 | 1 | LOG(INFO) << "Source tablet ID: " << source_tablet_id; |
1889 | | |
1890 | 1 | auto* offline_ts = cluster_->tablet_server(0); |
1891 | 1 | offline_ts->Shutdown(); |
1892 | 1 | LOG(INFO) << "Shutdown completed for tserver: " << offline_ts->uuid(); |
1893 | 1 | const auto offline_ts_id = offline_ts->uuid(); |
1894 | | |
1895 | 1 | ASSERT_OK(SplitTablet(source_tablet_id)); |
1896 | 1 | ASSERT_OK(WaitForTablets(3)); |
1897 | | |
1898 | 1 | ASSERT_OK(cluster_->AddTabletServer()); |
1899 | 1 | const auto new_ts_id = cluster_->tablet_server(3)->uuid(); |
1900 | 1 | LOG(INFO) << "Started new tserver: " << new_ts_id; |
1901 | | |
1902 | 1 | ASSERT_OK(cluster_->WaitForTabletServerCount(4, 20s)); |
1903 | 1 | LOG(INFO) << "New tserver has been added: " << new_ts_id; |
1904 | | |
1905 | 1 | const auto deadline = CoarseMonoClock::Now() + 30s * kTimeMultiplier; |
1906 | 1 | std::set<TabletServerId> source_tablet_replicas; |
1907 | 1 | auto s = LoggedWait( |
1908 | 71 | [this, &deadline, &source_tablet_id, &offline_ts_id, &new_ts_id, &source_tablet_replicas] { |
1909 | 71 | const MonoDelta remaining_timeout = deadline - CoarseMonoClock::Now(); |
1910 | 71 | if (remaining_timeout.IsNegative()) { |
1911 | 0 | return false; |
1912 | 0 | } |
1913 | 71 | master::TabletLocationsPB resp; |
1914 | 71 | const auto s = itest::GetTabletLocations( |
1915 | 71 | cluster_.get(), source_tablet_id, remaining_timeout, &resp); |
1916 | 71 | if (!s.ok()) { |
1917 | 0 | return false; |
1918 | 0 | } |
1919 | 71 | source_tablet_replicas.clear(); |
1920 | 203 | for (auto& replica : resp.replicas()) { |
1921 | 203 | source_tablet_replicas.insert(replica.ts_info().permanent_uuid()); |
1922 | 203 | } |
1923 | 71 | if (source_tablet_replicas.size() != 3) { |
1924 | 10 | return false; |
1925 | 10 | } |
1926 | 61 | if (source_tablet_replicas.count(offline_ts_id) > 0) { |
1927 | | // We don't expect source tablet to have replica on offline tserver. |
1928 | 60 | return false; |
1929 | 60 | } |
1930 | 1 | return source_tablet_replicas.count(new_ts_id) > 0; |
1931 | 1 | }, |
1932 | 1 | deadline, |
1933 | 1 | Format("Waiting for source tablet $0 to be moved to ts-4 ($1)", source_tablet_id, new_ts_id)); |
1934 | | |
1935 | 2 | ASSERT_TRUE(s.ok()) << s << ". Source tablet replicas: " << AsString(source_tablet_replicas); |
1936 | | |
1937 | | // Wait for the split to be completed on all online tservers. |
1938 | 5 | for (size_t ts_idx = 0; ts_idx < cluster_->num_tablet_servers(); ++ts_idx) { |
1939 | 4 | if (ts_idx == 3) { |
1940 | | // Skip new TS, because of https://github.com/yugabyte/yugabyte-db/issues/10301. |
1941 | | // TODO(tsplit): remove after it is fixed. |
1942 | 1 | continue; |
1943 | 1 | } |
1944 | 3 | if (cluster_->tablet_server(ts_idx)->IsProcessAlive()) { |
1945 | 2 | ASSERT_OK(WaitForTablets(3, ts_idx)); |
1946 | 2 | } |
1947 | 3 | } |
1948 | | |
1949 | | // Restarting offline_ts, because ClusterVerifier requires all tservers to be online. |
1950 | 1 | ASSERT_OK(offline_ts->Start()); |
1951 | | |
1952 | | // TODO(tsplit): remove after https://github.com/yugabyte/yugabyte-db/issues/10301 is fixed. |
1953 | 1 | DontVerifyClusterBeforeNextTearDown(); |
1954 | 1 | } |
1955 | | |
1956 | | class TabletSplitSystemRecordsITest : |
1957 | | public TabletSplitSingleServerITest, |
1958 | | public testing::WithParamInterface<Partitioning> { |
1959 | | protected: |
1960 | 2 | void SetUp() override { |
1961 | 2 | TabletSplitSingleServerITest::SetUp(); |
1962 | 2 | SetNumTablets(1); |
1963 | | |
1964 | | // Disable automatic tablet splitting. |
1965 | 2 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false; |
1966 | | // Disable automatic compactions. |
1967 | 2 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_base_background_compactions) = 0; |
1968 | 2 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_background_compactions) = 0; |
1969 | | // Disable manual compactions from flushes. |
1970 | 2 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = -1; |
1971 | 2 | } |
1972 | | |
1973 | 0 | CHECKED_STATUS VerifySplitKeyError(yb::tablet::TabletPtr tablet) { |
1974 | 0 | SCHECK_NOTNULL(tablet.get()); |
1975 | | |
1976 | | // Get middle key directly from in-memory tablet and check correct message has been returned. |
1977 | 0 | auto middle_key = tablet->GetEncodedMiddleSplitKey(); |
1978 | 0 | SCHECK_EQ(middle_key.ok(), false, IllegalState, "Valid split key is not expected."); |
1979 | 0 | const auto key_message = middle_key.status().message(); |
1980 | 0 | const auto is_expected_key_message = |
1981 | 0 | strnstr(key_message.cdata(), "got internal record", key_message.size()) != nullptr; |
1982 | 0 | SCHECK_EQ(is_expected_key_message, true, IllegalState, |
1983 | 0 | Format("Unexepected error message: $0", middle_key.status().ToString())); |
1984 | 0 | LOG(INFO) << "System record middle key result: " << middle_key.status().ToString(); |
1985 | | |
1986 | | // Test that tablet GetSplitKey RPC returns the same message. |
1987 | 0 | auto response = VERIFY_RESULT(GetSplitKey(tablet->tablet_id())); |
1988 | 0 | SCHECK_EQ(response.has_error(), true, IllegalState, |
1989 | 0 | "GetSplitKey RPC unexpectedly succeeded."); |
1990 | 0 | const Status op_status = StatusFromPB(response.error().status()); |
1991 | 0 | SCHECK_EQ(op_status.ToString(), middle_key.status().ToString(), IllegalState, |
1992 | 0 | Format("Unexpected error message: $0", op_status.ToString())); |
1993 | 0 | LOG(INFO) << "System record get split key result: " << op_status.ToString(); |
1994 | |
|
1995 | 0 | return Status::OK(); |
1996 | 0 | } |
1997 | | }; |
1998 | | |
1999 | 0 | TEST_P(TabletSplitSystemRecordsITest, GetSplitKey) { |
2000 | | // The idea of the test is to generate data with kNumTxns ApplyTransactionState records followed |
2001 | | // by 2 * kNumRows user records (very small number). This can be achieved by the following steps: |
2002 | | // 1) pause ApplyIntentsTasks to keep ApplyTransactionState records |
2003 | | // 2) run kNumTxns transactions with the same keys |
2004 | | // 3) run manual compaction to collapse all user records to the latest transaction content |
2005 | | // 4) at this step there are kNumTxns internal records followed by 2 * kNumRows user records |
2006 | | |
2007 | | // Selecting a small period for history cutoff to force compacting records with the same keys. |
2008 | 0 | constexpr auto kHistoryRetentionSec = 1; |
2009 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_timestamp_history_retention_interval_sec) |
2010 | 0 | = kHistoryRetentionSec; |
2011 | | |
2012 | | // This flag shouldn't be less than 2, setting it to 1 may cause RemoveIntentsTask to become stuck. |
2013 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_txn_max_apply_batch_records) = 2; |
2014 | | |
2015 | | // Force intents to not apply in ApplyIntentsTask. |
2016 | | // Note: transactions are still partly applied via tablet::UpdateTxnOperation::DoReplicated(), |
2017 | | // but ApplyTransactionState will not be removed from SST during compaction. |
2018 | 0 | ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_and_skip_apply_intents_task_loop_ms) = 1; |
2019 | | |
2020 | | // This combination of txns number and user rows is enough to generate suitable number of |
2021 | | // internal records to make middle key point to one of the ApplyTransactionState records. |
2022 | | // Also this will prevent spawning logs with long operation timeout warnings. |
2023 | 0 | constexpr auto kNumTxns = 50; |
2024 | 0 | constexpr auto kNumRows = 3; |
2025 | |
|
2026 | 0 | Schema schema; |
2027 | 0 | BuildSchema(GetParam(), &schema); |
2028 | 0 | ASSERT_OK(CreateTable(schema)); |
2029 | 0 | auto peer = ASSERT_RESULT(GetSingleTabletLeaderPeer()); |
2030 | 0 | auto tablet = peer->shared_tablet(); |
2031 | 0 | auto partition_schema = tablet->metadata()->partition_schema(); |
2032 | 0 | LOG(INFO) << "System records partitioning: " |
2033 | 0 | << "hash = " << partition_schema->IsHashPartitioning() << ", " |
2034 | 0 | << "range = " << partition_schema->IsRangePartitioning(); |
2035 | |
|
2036 | 0 | for (auto i = 0; i < kNumTxns; ++i) { |
2037 | 0 | ASSERT_OK(WriteRows(&table_, kNumRows, 1, kNumRows * i)); |
2038 | 0 | } |
2039 | | |
2040 | | // Sleep for kHistoryRetentionSec + delta to make sure all the records with the same keys |
2041 | | // will be compacted. Taking into account FLAGS_TEST_pause_and_skip_apply_intents_task_loop_ms |
2042 | | // is set, this leads to a warning for too long ScopedRWOperation (see ProcessApply routine). |
2043 | 0 | std::this_thread::sleep_for(std::chrono::seconds(kHistoryRetentionSec + 1)); |
2044 | | |
2045 | | // Force manual compaction |
2046 | 0 | ASSERT_OK(FlushTestTable()); |
2047 | 0 | ASSERT_OK(tablet->ForceFullRocksDBCompact()); |
2048 | 0 | ASSERT_OK(LoggedWaitFor( |
2049 | 0 | [peer]() -> Result<bool> { |
2050 | 0 | return peer->tablet_metadata()->has_been_fully_compacted(); |
2051 | 0 | }, |
2052 | 0 | 15s * kTimeMultiplier, |
2053 | 0 | "Wait for tablet manual compaction to be completed for peer: " + peer->tablet_id())); |
2054 | |
|
2055 | 0 | ASSERT_OK(VerifySplitKeyError(tablet)); |
2056 | 0 | } |
2057 | | |
2058 | | namespace { |
2059 | | |
2060 | | PB_ENUM_FORMATTERS(IsolationLevel); |
2061 | | |
2062 | | template <typename T> |
2063 | 205 | std::string TestParamToString(const testing::TestParamInfo<T>& param_info) { |
2064 | 205 | return ToString(param_info.param); |
2065 | 205 | } tablet-split-itest.cc:_ZN2yb12_GLOBAL__N_117TestParamToStringINS_14IsolationLevelEEENSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEERKN7testing13TestParamInfoIT_EE Line | Count | Source | 2063 | 123 | std::string TestParamToString(const testing::TestParamInfo<T>& param_info) { | 2064 | 123 | return ToString(param_info.param); | 2065 | 123 | } |
tablet-split-itest.cc:_ZN2yb12_GLOBAL__N_117TestParamToStringINS_6client13kv_table_test12PartitioningEEENSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEERKN7testing13TestParamInfoIT_EE Line | Count | Source | 2063 | 82 | std::string TestParamToString(const testing::TestParamInfo<T>& param_info) { | 2064 | 82 | return ToString(param_info.param); | 2065 | 82 | } |
|
2066 | | |
2067 | | } // namespace |
2068 | | |
2069 | | INSTANTIATE_TEST_CASE_P( |
2070 | | TabletSplitITest, |
2071 | | TabletSplitITestWithIsolationLevel, |
2072 | | ::testing::ValuesIn(GetAllPbEnumValues<IsolationLevel>()), |
2073 | | TestParamToString<IsolationLevel>); |
2074 | | |
2075 | | INSTANTIATE_TEST_CASE_P( |
2076 | | TabletSplitSingleServerITest, |
2077 | | TabletSplitSystemRecordsITest, |
2078 | | ::testing::ValuesIn(List(static_cast<Partitioning*>(nullptr))), |
2079 | | TestParamToString<Partitioning>); |
2080 | | |
2081 | | } // namespace yb |