/Users/deen/code/yugabyte-db/src/yb/docdb/randomized_docdb-test.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
17 | | |
18 | | |
19 | | #include "yb/docdb/docdb.h" |
20 | | #include "yb/docdb/docdb_test_base.h" |
21 | | #include "yb/docdb/docdb_test_util.h" |
22 | | #include "yb/util/scope_exit.h" |
23 | | |
24 | | // Use a lower default number of tests when running on ASAN/TSAN so as not to exceed the test time |
25 | | // limit. |
26 | | #if defined(THREAD_SANITIZER) || defined(ADDRESS_SANITIZER) |
27 | | static constexpr int kDefaultTestNumIter = 2000; |
28 | | static constexpr int kDefaultSnapshotVerificationTestNumIter = 2000; |
29 | | #else |
30 | | static constexpr int kDefaultTestNumIter = 20000; |
31 | | static constexpr int kDefaultSnapshotVerificationTestNumIter = 15000; |
32 | | #endif |
33 | | |
34 | | DEFINE_int32(snapshot_verification_test_num_iter, kDefaultSnapshotVerificationTestNumIter, |
35 | | "Number iterations for randomized history cleanup DocDB tests."); |
36 | | |
37 | | DEFINE_int32(test_num_iter, kDefaultTestNumIter, |
38 | | "Number iterations for randomized DocDB tests, except those involving logical DocDB " |
39 | | "snapshots."); |
40 | | |
41 | | constexpr int kNumDocKeys = 50; |
42 | | constexpr int kNumUniqueSubKeys = 500; |
43 | | |
44 | | using std::vector; |
45 | | using std::pair; |
46 | | using std::sort; |
47 | | |
48 | | namespace yb { |
49 | | namespace docdb { |
50 | | |
51 | | namespace { |
52 | | |
// Drops every entry of *v whose second component exceeds max_second_component,
// preserving the relative order of the surviving entries.
// See https://en.wikipedia.org/wiki/Erase-remove_idiom.
void RemoveEntriesWithSecondComponentHigherThan(vector<pair<int, int>> *v,
                                                int max_second_component) {
  const auto exceeds_limit = [max_second_component](const pair<int, int>& entry) {
    return entry.second > max_second_component;
  };
  v->erase(std::remove_if(v->begin(), v->end(), exceeds_limit), v->end());
}
62 | | |
63 | | } // anonymous namespace |
64 | | |
65 | | class RandomizedDocDBTest : public DocDBTestBase, |
66 | | public ::testing::WithParamInterface<ResolveIntentsDuringRead> { |
67 | | protected: |
68 | 0 | RandomizedDocDBTest() : verify_history_cleanup_(true) { |
69 | 0 | } |
70 | | |
71 | 0 | void Init(const UseHash use_hash) { |
72 | | // This test was created when this was the only supported init marker behavior. |
73 | 0 | SetInitMarkerBehavior(InitMarkerBehavior::kRequired); |
74 | 0 | if (load_gen_.get() != nullptr) { |
75 | 0 | ClearLogicalSnapshots(); |
76 | 0 | ASSERT_OK(DestroyRocksDB()); |
77 | 0 | ASSERT_OK(ReopenRocksDB()); |
78 | 0 | } |
79 | 0 | load_gen_.reset(new DocDBLoadGenerator(this, kNumDocKeys, kNumUniqueSubKeys, use_hash, |
80 | 0 | resolve_intents_)); |
81 | 0 | SeedRandom(); |
82 | 0 | } |
83 | | |
84 | 0 | ~RandomizedDocDBTest() override {} |
85 | | void RunWorkloadWithSnaphots(bool enable_history_cleanup); |
86 | | |
87 | 0 | int num_iterations_divider() { |
88 | | // Read path is slower when trying to resolve intents, so we reduce number of iterations in |
89 | | // order to respect the timeout. |
90 | 0 | return resolve_intents_ ? 2 : 1; |
91 | 0 | } |
92 | | |
93 | 0 | void CompactionWithCleanup(HybridTime cleanup_ht) { |
94 | 0 | const auto start_time = MonoTime::Now(); |
95 | 0 | ASSERT_NO_FATALS(FullyCompactHistoryBefore(cleanup_ht)); |
96 | 0 | const auto elapsed_time_ms = (MonoTime::Now() - start_time).ToMilliseconds(); |
97 | 0 | total_compaction_time_ms_ += elapsed_time_ms; |
98 | 0 | LOG(INFO) << "Compaction with cleanup_ht=" << cleanup_ht << " took " |
99 | 0 | << elapsed_time_ms << " ms, all compactions so far: " |
100 | 0 | << total_compaction_time_ms_ << " ms"; |
101 | 0 | } |
102 | | |
103 | | ResolveIntentsDuringRead resolve_intents_ = ResolveIntentsDuringRead::kTrue; |
104 | | bool verify_history_cleanup_; |
105 | | std::unique_ptr<DocDBLoadGenerator> load_gen_; |
106 | | int64_t total_compaction_time_ms_ = 0; |
107 | | }; |
108 | | |
// Runs the randomized DocDB workload while periodically flushing, capturing logical snapshots,
// and verifying a randomly chosen snapshot against the RocksDB-backed state. When
// enable_history_cleanup is true, also performs history cleanup compactions at random
// hybrid_times and checks their effect on earlier snapshots, and finally (when
// verify_history_cleanup_ is set and the iteration count is at its default) validates the
// recorded cleanup events against hard-coded expected results.
void RandomizedDocDBTest::RunWorkloadWithSnaphots(bool enable_history_cleanup) {
  auto scope_exit = ScopeExit([this]() {
    LOG(INFO) << "Total compaction time: " << total_compaction_time_ms_ << " ms";
  });
  // We start doing snapshots every other iterations, but make it less frequent after a number of
  // iterations (kIterationToSwitchToInfrequentSnapshots to be precise, see the loop below).
  int snapshot_frequency = 2;
  int verification_frequency = 1;

  constexpr int kEventualSnapshotFrequency = 1000;
  constexpr int kEventualVerificationFrequency = 250;
  constexpr int kFlushFrequency = 100;
  constexpr int kIterationToSwitchToInfrequentSnapshots = 300;

  // On average, attempt a history cleanup about once per this many iterations.
  constexpr int kHistoryCleanupChance = 500;

  // Records (history cutoff hybrid_time, last operation hybrid_time at that point) for each
  // cleanup performed at a hybrid_time higher than all previous cleanups.
  vector<pair<int, int>> cleanup_ht_and_iteration;

  HybridTime max_history_cleanup_ht(0);

  const int kNumIter = FLAGS_snapshot_verification_test_num_iter / num_iterations_divider();

  while (load_gen_->next_iteration() <= kNumIter) {
    const int current_iteration = load_gen_->next_iteration();
    if (current_iteration == kIterationToSwitchToInfrequentSnapshots) {
      // This is where we make snapshot/verification less frequent so the test can actually finish.
      snapshot_frequency = kEventualSnapshotFrequency;
      verification_frequency = kEventualVerificationFrequency;
    }
    ASSERT_NO_FATALS(load_gen_->PerformOperation()) << "at iteration " << current_iteration;
    if (current_iteration % kFlushFrequency == 0) {
      ASSERT_OK(FlushRocksDbAndWait());
    }
    if (current_iteration % snapshot_frequency == 0) {
      load_gen_->CaptureDocDbSnapshot();
    }
    if (current_iteration % verification_frequency == 0) {
      ASSERT_NO_FATALS(load_gen_->VerifyRandomDocDbSnapshot());
    }

    if (enable_history_cleanup && load_gen_->NextRandomInt(kHistoryCleanupChance) == 0) {
      // Pick a random cleanup hybrid_time from 0 to the last operation hybrid_time inclusively.
      const HybridTime cleanup_ht = HybridTime(
          load_gen_->NextRandom() % (load_gen_->last_operation_ht().value() + 1));
      if (cleanup_ht.CompareTo(max_history_cleanup_ht) <= 0) {
        // We are performing cleanup at an old hybrid_time, and don't expect it to have any effect.
        // Verify that by comparing full captures of the DocDB state before and after.
        InMemDocDbState snapshot_before_cleanup;
        snapshot_before_cleanup.CaptureAt(doc_db(), HybridTime::kMax);
        ASSERT_NO_FATALS(CompactionWithCleanup(cleanup_ht));

        InMemDocDbState snapshot_after_cleanup;
        snapshot_after_cleanup.CaptureAt(doc_db(), HybridTime::kMax);
        ASSERT_TRUE(snapshot_after_cleanup.EqualsAndLogDiff(snapshot_before_cleanup));
      } else {
        max_history_cleanup_ht = cleanup_ht;
        cleanup_ht_and_iteration.emplace_back(cleanup_ht.value(),
                                              load_gen_->last_operation_ht().value());
        ASSERT_NO_FATALS(CompactionWithCleanup(cleanup_ht));

        // We expect some snapshots at hybrid_times earlier than cleanup_ht to no longer be
        // recoverable.
        ASSERT_NO_FATALS(load_gen_->CheckIfOldestSnapshotIsStillValid(cleanup_ht));

        load_gen_->RemoveSnapshotsBefore(cleanup_ht);

        // Now that we've removed all snapshots that could have been affected by history cleanup,
        // we expect the oldest remaining snapshot to match the RocksDB-backed DocDB state.
        ASSERT_NO_FATALS(load_gen_->VerifyOldestSnapshot());
      }
    }
  }

  LOG(INFO) << "Finished the primary part of the randomized DocDB test.\n"
            << " enable_history_cleanup: " << enable_history_cleanup << "\n"
            << " last_operation_ht: " << load_gen_->last_operation_ht() << "\n"
            << " max_history_cleanup_ht: " << max_history_cleanup_ht.value();

  if (!enable_history_cleanup || !verify_history_cleanup_) return;

  // The hard-coded expectations below only hold for runs up to the default iteration count.
  if (FLAGS_snapshot_verification_test_num_iter > kDefaultSnapshotVerificationTestNumIter) {
    LOG(WARNING)
        << "Number of iterations specified for the history cleanup test is greater than "
        << kDefaultSnapshotVerificationTestNumIter << ", and therefore this test is "
        << "NOT CHECKING THAT OLD SNAPSHOTS ARE INVALIDATED BY HISTORY CLEANUP.";
    return;
  }

  // Verify that some old snapshots got invalidated by history cleanup at a higher hybrid_time.

  // First we verify that history cleanup is happening at expected times, so that we can validate
  // that the maximum history cleanup hybrid_time (max_history_cleanup_ht) is as expected.

  // An entry (t, i) here says that after iteration i there was a history cleanup with a history
  // cutoff hybrid_time of t. The iteration here corresponds one to one to the operation
  // hybrid_time. We always have t < i because we perform cleanup at a past hybrid_time,
  // not a future one.
  // cleanup_ht | iteration (last op. ts.)
  //
  // These numbers depend on DocDB load generator parameters (random seed, frequencies of various
  // events) and will need to be replaced in such cases. Ideally, we should come up with a way to
  // either re-generate those quickly, or not rely on hard-coded expected results for validation.
  // However, we do handle variations in the number of iterations here, up a certain limit.
  vector<pair<int, int>> expected_cleanup_ht_and_iteration{{1, 85},
                                                           {40, 121},
                                                           {46, 255},
                                                           {245, 484},
                                                           {774, 2246},
                                                           {2341, 3417},
                                                           {2741, 5248},
                                                           {4762, 5652},
                                                           {5049, 6377},
                                                           {6027, 7573},
                                                           {8423, 9531},
                                                           {8829, 10413},
                                                           {10061, 10610},
                                                           {13137, 13920}};

  // Remove expected (cleanup_hybrid_time, iteration) entries that don't apply to our test run in
  // case we did fewer than 15000 iterations.
  RemoveEntriesWithSecondComponentHigherThan(
      &expected_cleanup_ht_and_iteration,
      narrow_cast<int>(load_gen_->last_operation_ht().value()));

  ASSERT_FALSE(expected_cleanup_ht_and_iteration.empty());
  ASSERT_EQ(expected_cleanup_ht_and_iteration, cleanup_ht_and_iteration);

  // Short runs are not expected to have produced any divergent (invalidated) old snapshots.
  if (kNumIter > 2000) {
    ASSERT_GT(load_gen_->num_divergent_old_snapshot(), 0);
  } else {
    ASSERT_EQ(0, load_gen_->num_divergent_old_snapshot());
  }

  // Expected hybrid_times of snapshots invalidated by history cleanup, and actual history cutoff
  // hybrid_times at which that happened. This is deterministic, but highly dependent on the
  // parameters at the top of this test.
  vector<pair<int, int>> expected_divergent_snapshot_and_cleanup_ht{
      {298, 774},
      {2000, 2341},
      {4000, 4762},
      {5000, 5049},
      {6000, 6027},
      {8000, 8423},
      {10000, 10061},
      {13000, 13137}
  };

  // Remove entries that don't apply to us because we did not get to do a cleanup at that
  // hybrid_time.
  RemoveEntriesWithSecondComponentHigherThan(&expected_divergent_snapshot_and_cleanup_ht,
                                             narrow_cast<int>(max_history_cleanup_ht.value()));

  ASSERT_EQ(expected_divergent_snapshot_and_cleanup_ht,
            load_gen_->divergent_snapshot_ht_and_cleanup_ht());
}
263 | | |
264 | 0 | TEST_P(RandomizedDocDBTest, TestNoFlush) { |
265 | 0 | resolve_intents_ = GetParam(); |
266 | 0 | const int num_iter = FLAGS_test_num_iter / num_iterations_divider(); |
267 | 0 | for (auto use_hash : UseHash::kValues) { |
268 | 0 | Init(use_hash); |
269 | 0 | while (load_gen_->next_iteration() <= num_iter) { |
270 | 0 | ASSERT_NO_FATALS(load_gen_->PerformOperation()) << "at iteration " << |
271 | 0 | load_gen_->next_iteration(); |
272 | 0 | } |
273 | 0 | } |
274 | 0 | } |
275 | | |
276 | 0 | TEST_P(RandomizedDocDBTest, TestWithFlush) { |
277 | 0 | resolve_intents_ = GetParam(); |
278 | 0 | const int num_iter = FLAGS_test_num_iter / num_iterations_divider(); |
279 | 0 | for (auto use_hash : UseHash::kValues) { |
280 | 0 | Init(use_hash); |
281 | 0 | while (load_gen_->next_iteration() <= num_iter) { |
282 | 0 | ASSERT_NO_FATALS(load_gen_->PerformOperation()) << "at iteration " |
283 | 0 | << load_gen_->next_iteration(); |
284 | 0 | if (load_gen_->next_iteration() % 250 == 0) { |
285 | 0 | ASSERT_NO_FATALS(load_gen_->FlushRocksDB()); |
286 | 0 | } |
287 | 0 | } |
288 | 0 | } |
289 | 0 | } |
290 | | |
// Runs the snapshot capture/verification workload without any history cleanup.
TEST_P(RandomizedDocDBTest, Snapshots) {
  resolve_intents_ = GetParam();
  for (auto use_hash : UseHash::kValues) {
    Init(use_hash);
    RunWorkloadWithSnaphots(/* enable_history_cleanup = */ false);
  }
}
298 | | |
// Runs the snapshot capture/verification workload with randomized history cleanup enabled.
TEST_P(RandomizedDocDBTest, SnapshotsWithHistoryCleanup) {
  resolve_intents_ = GetParam();
  for (auto use_hash : UseHash::kValues) {
    Init(use_hash);
    // Don't verify history cleanup in case we use hashed components, since the hardcoded
    // expected values don't work for that use case.
    // TODO: update expected values or find a better way to test it.
    verify_history_cleanup_ = !use_hash;
    RunWorkloadWithSnaphots(/* enable_history_cleanup = */ true);
  }
}
310 | | |
// Instantiate every TEST_P above both with and without intent resolution during reads.
INSTANTIATE_TEST_CASE_P(bool, RandomizedDocDBTest, ::testing::Values(
    ResolveIntentsDuringRead::kFalse, ResolveIntentsDuringRead::kTrue));
313 | | |
314 | | // This is a bit different from SnapshotsWithHistoryCleanup. Here, we perform history cleanup within |
315 | | // DocDBLoadGenerator::PerformOperation itself, reading the document being modified both before |
316 | | // and after the history cleanup. |
317 | 0 | TEST_F(RandomizedDocDBTest, ImmediateHistoryCleanup) { |
318 | 0 | for (auto use_hash : UseHash::kValues) { |
319 | 0 | Init(use_hash); |
320 | 0 | while (load_gen_->next_iteration() <= FLAGS_test_num_iter) { |
321 | 0 | if (load_gen_->next_iteration() % 250 == 0) { |
322 | 0 | ASSERT_NO_FATALS(load_gen_->FlushRocksDB()); |
323 | 0 | ASSERT_NO_FATALS(load_gen_->PerformOperation(/* history_cleanup = */ true)); |
324 | 0 | } else { |
325 | 0 | ASSERT_NO_FATALS(load_gen_->PerformOperation()); |
326 | 0 | } |
327 | 0 | } |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | | } // namespace docdb |
332 | | } // namespace yb |