YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/corruption_test.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under the BSD-style license found in the
3
//  LICENSE file in the root directory of this source tree. An additional grant
4
//  of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
21
// Use of this source code is governed by a BSD-style license that can be
22
// found in the LICENSE file. See the AUTHORS file for names of contributors.
23
24
#ifndef ROCKSDB_LITE
25
26
#include <inttypes.h>
27
#include <errno.h>
28
#include <fcntl.h>
29
#include <sys/stat.h>
30
#include <sys/types.h>
31
32
#include <gtest/gtest.h>
33
34
#include "yb/rocksdb/db.h"
35
36
#include "yb/rocksdb/cache.h"
37
#include "yb/rocksdb/env.h"
38
#include "yb/rocksdb/table.h"
39
#include "yb/rocksdb/write_batch.h"
40
#include "yb/rocksdb/db/db_impl.h"
41
#include "yb/rocksdb/db/filename.h"
42
#include "yb/rocksdb/db/log_format.h"
43
#include "yb/rocksdb/db/version_set.h"
44
#include "yb/rocksdb/util/logging.h"
45
#include "yb/rocksdb/util/testharness.h"
46
#include "yb/rocksdb/util/testutil.h"
47
48
#include "yb/util/test_macros.h"
49
50
namespace rocksdb {
51
52
static const int kValueSize = 1000;
53
54
class CorruptionTest : public RocksDBTest {
55
 public:
56
  test::ErrorEnv env_;
57
  std::string dbname_;
58
  shared_ptr<Cache> tiny_cache_;
59
  Options options_;
60
  DB* db_;
61
62
12
  CorruptionTest() {
63
12
    tiny_cache_ = NewLRUCache(100);
64
12
    options_.env = &env_;
65
12
    dbname_ = test::TmpDir() + "/corruption_test";
66
12
    CHECK_OK(DestroyDB(dbname_, options_));
67
68
12
    db_ = nullptr;
69
12
    options_.create_if_missing = true;
70
12
    BlockBasedTableOptions table_options;
71
12
    table_options.block_size_deviation = 0;  // make unit test pass for now
72
12
    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
73
12
    Reopen();
74
12
    options_.create_if_missing = false;
75
12
  }
76
77
12
  ~CorruptionTest() {
78
12
     delete db_;
79
12
     CHECK_OK(DestroyDB(dbname_, Options()));
80
12
  }
81
82
0
  void CloseDb() {
83
0
    delete db_;
84
0
    db_ = nullptr;
85
0
  }
86
87
32
  Status TryReopen(Options* options = nullptr) {
88
32
    delete db_;
89
32
    db_ = nullptr;
90
21
    Options opt = (options ? *options : options_);
91
32
    opt.env = &env_;
92
32
    opt.arena_block_size = 4096;
93
32
    BlockBasedTableOptions table_options;
94
32
    table_options.block_cache = tiny_cache_;
95
32
    table_options.block_size_deviation = 0;
96
32
    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
97
32
    return DB::Open(opt, dbname_, &db_);
98
32
  }
99
100
27
  void Reopen(Options* options = nullptr) {
101
27
    ASSERT_OK(TryReopen(options));
102
27
  }
103
104
3
  void RepairDB() {
105
3
    delete db_;
106
3
    db_ = nullptr;
107
3
    ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
108
3
  }
109
110
10
  void Build(int n, int flush_every = 0) {
111
10
    std::string key_space, value_space;
112
10
    WriteBatch batch;
113
21.2k
    for (int i = 0; i < n; i++) {
114
21.2k
      if (flush_every != 0 && i != 0 && i % flush_every == 0) {
115
1
        DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
116
1
        ASSERT_OK(dbi->TEST_FlushMemTable());
117
1
      }
118
      // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
119
21.2k
      Slice key = Key(i, &key_space);
120
21.2k
      batch.Clear();
121
21.2k
      batch.Put(key, Value(i, &value_space));
122
21.2k
      ASSERT_OK(db_->Write(WriteOptions(), &batch));
123
21.2k
    }
124
10
  }
125
126
8
  void Check(int min_expected, int max_expected) {
127
8
    uint64_t next_expected = 0;
128
8
    uint64_t missed = 0;
129
8
    int bad_keys = 0;
130
8
    int bad_values = 0;
131
8
    int correct = 0;
132
8
    std::string value_space;
133
    // Do not verify checksums. If we verify checksums then the
134
    // db itself will raise errors because data is corrupted.
135
    // Instead, we want the reads to be successful and this test
136
    // will detect whether the appropriate corruptions have
137
    // occurred.
138
8
    Iterator* iter = db_->NewIterator(ReadOptions(false, true));
139
16.2k
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
140
16.2k
      uint64_t key;
141
16.2k
      Slice in(iter->key());
142
16.2k
      if (!ConsumeDecimalNumber(&in, &key) ||
143
16.2k
          !in.empty() ||
144
16.2k
          key < next_expected) {
145
2
        bad_keys++;
146
2
        continue;
147
2
      }
148
16.2k
      missed += (key - next_expected);
149
16.2k
      next_expected = key + 1;
150
16.2k
      if (iter->value() != Value(static_cast<int>(key), &value_space)) {
151
3
        bad_values++;
152
16.2k
      } else {
153
16.2k
        correct++;
154
16.2k
      }
155
16.2k
    }
156
8
    delete iter;
157
158
8
    fprintf(stderr,
159
8
      "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%" PRIu64 "\n",
160
8
            min_expected, max_expected, correct, bad_keys, bad_values, missed);
161
8
    ASSERT_LE(min_expected, correct);
162
8
    ASSERT_GE(max_expected, correct);
163
8
  }
164
165
  // Corrupts specified number of bytes in SST starting at specified offset.
166
  // If SST is split into base and data files, then we treat offset as offset in composite data
167
  // space where data files go first and metadata file goes after data files.
168
  // This method doesn't support the case when area to be corrupted spans both base and data file.
169
  // An assert guards against such cases, since they are not required for tests as of 2017-03-09.
170
8
  void CorruptFile(const std::string& base_fname, int offset, int bytes_to_corrupt) {
171
8
    std::string fname;
172
173
8
    {
174
8
      struct stat base_sbuf;
175
8
      if (stat(base_fname.c_str(), &base_sbuf) != 0) {
176
0
        const char *msg = strerror(errno);
177
0
        ASSERT_TRUE(false) << base_fname << ": " << msg;
178
8
      }
179
180
8
      struct stat data_sbuf;
181
8
      const std::string data_fname = TableBaseToDataFileName(base_fname);
182
8
      const bool is_split_sst = stat(data_fname.c_str(), &data_sbuf) == 0;
183
5
      const auto total_size = base_sbuf.st_size + (is_split_sst ? data_sbuf.st_size : 0);
184
185
8
      if (offset < 0) {
186
        // Relative to end of file; make it absolute
187
1
        if (-offset > total_size) {
188
0
          offset = 0;
189
1
        } else {
190
1
          offset = static_cast<int>(total_size + offset);
191
1
        }
192
1
      }
193
8
      if (offset > total_size) {
194
0
        offset = static_cast<int>(total_size);
195
0
      }
196
8
      if (offset + bytes_to_corrupt > total_size) {
197
1
        bytes_to_corrupt = static_cast<int>(total_size - offset);
198
1
      }
199
200
8
      if (is_split_sst) {
201
5
        if (offset >= data_sbuf.st_size) {
202
          // Offset is beyond data file, we need to corrupt base file.
203
1
          offset -= data_sbuf.st_size;
204
1
          fname = base_fname;
205
4
        } else {
206
          // Ensure area to be corrupted does not span both data and base file.
207
4
          assert(offset + bytes_to_corrupt <= data_sbuf.st_size);
208
4
          fname = data_fname;
209
4
        }
210
3
      } else {
211
3
        fname = base_fname;
212
3
      }
213
8
    }
214
215
    // Do it
216
8
    std::string contents;
217
8
    Status s = ReadFileToString(Env::Default(), fname, &contents);
218
16
    ASSERT_TRUE(s.ok()) << s.ToString();
219
684
    for (int i = 0; i < bytes_to_corrupt; i++) {
220
676
      contents[i + offset] ^= 0x80;
221
676
    }
222
8
    s = WriteStringToFile(Env::Default(), contents, fname);
223
16
    ASSERT_TRUE(s.ok()) << s.ToString();
224
8
  }
225
226
7
  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
227
    // Pick file to corrupt
228
7
    std::vector<std::string> filenames;
229
7
    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
230
7
    uint64_t number;
231
7
    FileType type;
232
7
    std::string fname;
233
7
    int picked_number = -1;
234
86
    for (size_t i = 0; i < filenames.size(); i++) {
235
79
      if (ParseFileName(filenames[i], &number, &type) &&
236
56
          type == filetype &&
237
8
          static_cast<int>(number) > picked_number) {  // Pick latest file
238
7
        fname = dbname_ + "/" + filenames[i];
239
7
        picked_number = static_cast<int>(number);
240
7
      }
241
79
    }
242
14
    ASSERT_TRUE(!fname.empty()) << filetype;
243
244
7
    CorruptFile(fname, offset, bytes_to_corrupt);
245
7
  }
246
247
  // Corrupts exactly one file at level `level`. If no file is found at that level,
248
  // the test assertion fails.
249
1
  void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
250
1
    std::vector<LiveFileMetaData> metadata;
251
1
    db_->GetLiveFilesMetaData(&metadata);
252
1
    for (const auto& m : metadata) {
253
1
      if (m.level == level) {
254
1
        CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
255
1
        return;
256
1
      }
257
1
    }
258
0
    ASSERT_TRUE(false) << "no file found at level";
259
0
  }
260
261
262
2
  int Property(const std::string& name) {
263
2
    std::string property;
264
2
    int result;
265
2
    if (db_->GetProperty(name, &property) &&
266
2
        sscanf(property.c_str(), "%d", &result) == 1) {
267
2
      return result;
268
0
    } else {
269
0
      return -1;
270
0
    }
271
2
  }
272
273
  // Return the ith key
274
31.2k
  Slice Key(int i, std::string* storage) {
275
31.2k
    char buf[100];
276
31.2k
    snprintf(buf, sizeof(buf), "%016d", i);
277
31.2k
    storage->assign(buf, strlen(buf));
278
31.2k
    return Slice(*storage);
279
31.2k
  }
280
281
  // Return the value to associate with the specified key
282
51.7k
  Slice Value(int k, std::string* storage) {
283
51.7k
    if (k == 0) {
284
      // Ugh.  Random seed of 0 used to produce no entropy.  This code
285
      // preserves the implementation that was in place when all of the
286
      // magic values in this file were picked.
287
18
      *storage = std::string(kValueSize, ' ');
288
18
      return Slice(*storage);
289
51.6k
    } else {
290
51.6k
      Random r(k);
291
51.6k
      return RandomString(&r, kValueSize, storage);
292
51.6k
    }
293
51.7k
  }
294
};
295
296
1
TEST_F(CorruptionTest, Recovery) {
297
1
  Build(100);
298
1
  Check(100, 100);
299
#ifdef OS_WIN
300
  // On Windows OS, the disk cache does not behave properly.
301
  // We do not call FlushBuffers on every Flush. If we do not close
302
  // the log file prior to the corruption we end up with the first
303
  // block not corrupted but only the second. However, under the debugger
304
  // things work just fine but never pass when running normally.
305
  // For that reason people may want to run with unbuffered I/O. That option
306
  // is not available for WAL though.
307
  CloseDb();
308
#endif
309
1
  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
310
1
  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
311
1
  ASSERT_TRUE(!TryReopen().ok());
312
1
  options_.paranoid_checks = false;
313
1
  Reopen(&options_);
314
315
  // The 64 records in the first two log blocks are completely lost.
316
1
  Check(36, 36);
317
1
}
318
319
1
TEST_F(CorruptionTest, RecoverWriteError) {
320
1
  env_.writable_file_error_ = true;
321
1
  Status s = TryReopen();
322
1
  ASSERT_TRUE(!s.ok());
323
1
}
324
325
1
TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
326
  // Do enough writing to force minor compaction
327
1
  env_.writable_file_error_ = true;
328
1
  const int num =
329
1
      static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
330
1
  std::string value_storage;
331
1
  Status s;
332
1
  bool failed = false;
333
4.19k
  for (int i = 0; i < num; i++) {
334
4.19k
    WriteBatch batch;
335
4.19k
    batch.Put("a", Value(100, &value_storage));
336
4.19k
    s = db_->Write(WriteOptions(), &batch);
337
4.19k
    if (!s.ok()) {
338
486
      failed = true;
339
486
    }
340
4.19k
    ASSERT_TRUE(!failed || !s.ok());
341
4.19k
  }
342
1
  ASSERT_TRUE(!s.ok());
343
1
  ASSERT_GE(env_.num_writable_file_errors_, 1);
344
1
  env_.writable_file_error_ = false;
345
1
  Reopen();
346
1
}
347
348
1
TEST_F(CorruptionTest, TableFile) {
349
1
  Build(100);
350
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
351
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
352
1
  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
353
1
  ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
354
355
1
  Corrupt(kTableFile, 100, 1);
356
1
  Check(99, 99);
357
1
}
358
359
1
TEST_F(CorruptionTest, TableFileIndexData) {
360
1
  Options options;
361
  // very big, we'll trigger flushes manually
362
1
  options.write_buffer_size = 100 * 1024 * 1024;
363
1
  Reopen(&options);
364
  // build 2 tables, flush at 5000
365
1
  Build(10000, 5000);
366
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
367
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
368
369
  // Corrupt top level index block of an entire file.
370
1
  Corrupt(kTableFile, -1000, 500);
371
1
  Reopen();
372
  // one full file should be readable, since only one was corrupted
373
  // the other file should be fully non-readable, since index was corrupted
374
1
  Check(5000, 5000);
375
1
}
376
377
1
TEST_F(CorruptionTest, MissingDescriptor) {
378
1
  Build(1000);
379
1
  RepairDB();
380
1
  Reopen();
381
1
  Check(1000, 1000);
382
1
}
383
384
1
TEST_F(CorruptionTest, SequenceNumberRecovery) {
385
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
386
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
387
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
388
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
389
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
390
1
  RepairDB();
391
1
  Reopen();
392
1
  std::string v;
393
1
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
394
1
  ASSERT_EQ("v5", v);
395
  // Write something.  If sequence number was not recovered properly,
396
  // it will be hidden by an earlier write.
397
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
398
1
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
399
1
  ASSERT_EQ("v6", v);
400
1
  Reopen();
401
1
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
402
1
  ASSERT_EQ("v6", v);
403
1
}
404
405
1
TEST_F(CorruptionTest, CorruptedDescriptor) {
406
1
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
407
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
408
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
409
1
  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
410
411
1
  Corrupt(kDescriptorFile, 0, 1000);
412
1
  Status s = TryReopen();
413
1
  ASSERT_TRUE(!s.ok());
414
415
1
  RepairDB();
416
1
  Reopen();
417
1
  std::string v;
418
1
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
419
1
  ASSERT_EQ("hello", v);
420
1
}
421
422
1
TEST_F(CorruptionTest, CompactionInputError) {
423
1
  Options options;
424
1
  Reopen(&options);
425
1
  Build(10);
426
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
427
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
428
1
  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
429
1
  ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
430
1
  ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
431
432
1
  Corrupt(kTableFile, 100, 1);
433
1
  Check(9, 9);
434
435
  // Force compactions by writing lots of values
436
1
  Build(10000);
437
1
  Check(10000, 10000);
438
1
}
439
440
1
TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
441
1
  Options options;
442
1
  options.paranoid_checks = true;
443
1
  options.write_buffer_size = 131072;
444
1
  options.max_write_buffer_number = 2;
445
1
  Reopen(&options);
446
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
447
448
  // Fill levels >= 1
449
7
  for (int level = 1; level < dbi->NumberLevels(); level++) {
450
6
    ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
451
6
    ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
452
6
    ASSERT_OK(dbi->TEST_FlushMemTable());
453
27
    for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
454
21
         ++comp_level) {
455
21
      ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
456
21
    }
457
6
  }
458
459
1
  Reopen(&options);
460
461
1
  dbi = reinterpret_cast<DBImpl*>(db_);
462
1
  Build(10);
463
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
464
1
  ASSERT_OK(dbi->TEST_WaitForCompact());
465
1
  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
466
467
1
  CorruptTableFileAtLevel(0, 100, 1);
468
1
  Check(9, 9);
469
470
  // Write must eventually fail because of corrupted table
471
1
  Status s;
472
1
  std::string tmp1, tmp2;
473
1
  bool failed = false;
474
10.0k
  for (int i = 0; i < 10000; i++) {
475
10.0k
    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
476
10.0k
    if (!s.ok()) {
477
8.96k
      failed = true;
478
8.96k
    }
479
    // if one write failed, every subsequent write must fail, too
480
20.0k
    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
481
10.0k
  }
482
2
  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
483
1
}
484
485
1
TEST_F(CorruptionTest, UnrelatedKeys) {
486
1
  Build(10);
487
1
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
488
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
489
1
  Corrupt(kTableFile, 100, 1);
490
491
1
  std::string tmp1, tmp2;
492
1
  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
493
1
  std::string v;
494
1
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
495
1
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
496
1
  ASSERT_OK(dbi->TEST_FlushMemTable());
497
1
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
498
1
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
499
1
}
500
501
1
TEST_F(CorruptionTest, FileSystemStateCorrupted) {
502
3
  for (int iter = 0; iter < 2; ++iter) {
503
2
    Options options;
504
2
    options.paranoid_checks = true;
505
2
    options.create_if_missing = true;
506
2
    Reopen(&options);
507
2
    Build(10);
508
2
    ASSERT_OK(db_->Flush(FlushOptions()));
509
2
    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
510
2
    std::vector<LiveFileMetaData> metadata;
511
2
    dbi->GetLiveFilesMetaData(&metadata);
512
2
    ASSERT_GT(metadata.size(), size_t(0));
513
2
    std::string filename = dbname_ + metadata[0].name;
514
515
2
    delete db_;
516
2
    db_ = nullptr;
517
518
2
    if (iter == 0) {  // corrupt file size
519
1
      unique_ptr<WritableFile> file;
520
1
      ASSERT_OK(env_.NewWritableFile(filename, &file, EnvOptions()));
521
1
      ASSERT_OK(file->Append(Slice("corrupted sst")));
522
1
      file.reset();
523
1
    } else {  // delete the file
524
1
      ASSERT_OK(env_.DeleteFile(filename));
525
1
    }
526
527
2
    Status x = TryReopen(&options);
528
2
    ASSERT_TRUE(x.IsCorruption());
529
2
    ASSERT_OK(DestroyDB(dbname_, options_));
530
2
    Reopen(&options);
531
2
  }
532
1
}
533
534
}  // namespace rocksdb
535
536
13.2k
int main(int argc, char** argv) {
537
13.2k
  ::testing::InitGoogleTest(&argc, argv);
538
13.2k
  return RUN_ALL_TESTS();
539
13.2k
}
540
541
#else
542
#include <stdio.h>
543
544
int main(int argc, char** argv) {
545
  fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
546
  return 0;
547
}
548
549
#endif  // !ROCKSDB_LITE