YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/util/io_posix.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under the BSD-style license found in the
3
//  LICENSE file in the root directory of this source tree. An additional grant
4
//  of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
21
// Use of this source code is governed by a BSD-style license that can be
22
// found in the LICENSE file. See the AUTHORS file for names of contributors.
23
24
#ifdef ROCKSDB_LIB_IO_POSIX
25
26
#include "yb/rocksdb/util/io_posix.h"
27
#include <errno.h>
28
#include <fcntl.h>
29
#if defined(__linux__)
30
#include <linux/fs.h>
31
#endif
32
#include <stdio.h>
33
#include <stdlib.h>
34
#include <string.h>
35
#include <sys/ioctl.h>
36
#include <sys/mman.h>
37
#include <sys/stat.h>
38
#include <sys/types.h>
39
#ifdef __linux__
40
#include <sys/statfs.h>
41
#include <sys/syscall.h>
42
#endif
43
#include "yb/rocksdb/port/port.h"
44
#include "yb/rocksdb/util/coding.h"
45
#include "yb/rocksdb/util/posix_logger.h"
46
#include "yb/rocksdb/util/sync_point.h"
47
48
#include "yb/util/file_system_posix.h"
49
#include "yb/util/malloc.h"
50
#include "yb/util/result.h"
51
#include "yb/util/slice.h"
52
#include "yb/util/stats/iostats_context_imp.h"
53
#include "yb/util/status_log.h"
54
#include "yb/util/std_util.h"
55
#include "yb/util/string_util.h"
56
57
DECLARE_bool(never_fsync);
58
59
namespace rocksdb {
60
61
// Thin wrapper around posix_fadvise().
//
// On Linux the call is forwarded to posix_fadvise() and its result is
// returned unchanged: 0 on success, otherwise an error number. Note that
// posix_fadvise() reports failure through its return value, not through
// errno. On every other platform this is a no-op that always returns 0.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef __linux__
  return posix_fadvise(fd, offset, static_cast<off_t>(len), advice);
#else
  // The advice is only a hint, so silently ignoring it is acceptable here.
  (void)fd;
  (void)offset;
  (void)len;
  (void)advice;
  return 0;
#endif
}
70
71
/*
72
 * PosixMmapReadableFile
73
 *
74
 * mmap() based random-access
75
 */
76
// base[0,length-1] contains the mmapped contents of the file.
77
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
78
                                             const std::string& fname,
79
                                             void* base, size_t length,
80
                                             const EnvOptions& options)
81
4.74k
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
82
4.74k
  fd_ = fd_ + 0;  // suppress the warning for used variables
83
4.74k
  assert(options.use_mmap_reads);
84
0
  assert(options.use_os_buffer);
85
4.74k
}
86
87
4.74k
// Unmaps the region handed to the constructor. A destructor cannot return an
// error, so a munmap() failure is only reported on stdout.
PosixMmapReadableFile::~PosixMmapReadableFile() {
  if (munmap(mmapped_region_, length_) == 0) {
    return;
  }
  fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
          mmapped_region_, length_);
}
94
95
// Points *result at up to `n` bytes of the mapping starting at `offset`.
//
// No data is copied and `scratch` is not used: the returned slice refers
// directly to the mmapped region. A request that runs past the end of the
// mapping is shortened to the bytes that remain. An `offset` beyond the end
// of the mapping yields an empty slice and an EINVAL I/O error.
Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
                                   uint8_t* scratch) const {
  if (offset > length_) {
    *result = Slice();
    return STATUS_IO_ERROR(filename_, EINVAL);
  }
  // Compare against the remaining byte count instead of computing
  // `offset + n`, which can wrap around for very large `n` and would then
  // skip the clamp entirely.
  const uint64_t remaining = length_ - offset;
  if (n > remaining) {
    n = static_cast<size_t>(remaining);
  }
  *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
  return Status::OK();
}
107
108
0
// Advises the OS that the cached pages for [offset, offset + length) of the
// underlying file are no longer needed. This is a no-op that returns OK on
// non-Linux platforms.
Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef __linux__
  return Status::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return Status::OK();
  }
  // posix_fadvise() hands back the error number as its return value and does
  // not set errno, so the status must be built from `ret`.
  return STATUS_IO_ERROR(filename_, ret);
#endif
}
120
121
0
// Reports the length of the mapped region as the file size.
yb::Result<uint64_t> PosixMmapReadableFile::Size() const {
  const uint64_t mapped_bytes = length_;
  return mapped_bytes;
}
124
125
0
// Looks up the inode number by calling stat() on the stored file name.
// Returns an I/O error carrying errno when stat() fails.
yb::Result<uint64_t> PosixMmapReadableFile::INode() const {
  struct stat file_info;
  const int rc = stat(filename_.c_str(), &file_info);
  if (rc == 0) {
    return file_info.st_ino;
  }
  return STATUS_IO_ERROR(filename_, errno);
}
133
134
0
size_t PosixMmapReadableFile::memory_footprint() const {
135
0
  return malloc_usable_size(this) + filename_.capacity();
136
0
}
137
138
/*
139
 * PosixMmapFile
140
 *
141
 * We preallocate up to an extra megabyte and use memcpy to append new
142
 * data to the file.  This is safe since we either properly close the
143
 * file before reading from it, or for log files, the reading code
144
 * knows enough to skip zero suffixes.
145
 */
146
0
// Unmaps the currently mapped window, if there is one.
//
// On success the file offset is advanced past the unmapped window, all window
// pointers are reset to nullptr, and the size used for the next mapping is
// doubled until it reaches 1MB. Returns an I/O error carrying errno if
// munmap() fails; in that case no state is modified.
Status PosixMmapFile::UnmapCurrentRegion() {
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
  if (base_ == nullptr) {
    return Status::OK();
  }

  const size_t region_size = limit_ - base_;
  if (munmap(base_, region_size) != 0) {
    // munmap() returns -1 on failure and reports the cause through errno;
    // its return value is not an error number.
    return STATUS_IO_ERROR(filename_, errno);
  }
  file_offset_ += region_size;
  base_ = nullptr;
  limit_ = nullptr;
  last_sync_ = nullptr;
  dst_ = nullptr;

  // Increase the amount we map the next time, but capped at 1MB
  if (map_size_ < (1 << 20)) {
    map_size_ *= 2;
  }
  return Status::OK();
}
166
167
0
// Maps a fresh window of map_size_ bytes starting at file_offset_.
//
// When fallocate is allowed, the window is first allocated on disk with
// fallocate(), falling back to posix_fallocate(). On success base_, dst_ and
// last_sync_ point at the start of the new mapping and limit_ at its end.
// Returns NotSupported when the build has no fallocate() support, and an
// IOError when allocation or mmap() fails.
Status PosixMmapFile::MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  assert(base_ == nullptr);

  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
  // we can't fallocate with FALLOC_FL_KEEP_SIZE here
  if (allow_fallocate_) {
    IOSTATS_TIMER_GUARD(allocate_nanos);
    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
    if (alloc_status != 0) {
      // fallback to posix_fallocate, which returns the error number directly
      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    }
    if (alloc_status != 0) {
      // Keep a separator between the file name and the error text.
      return STATUS(IOError, "Error allocating space to file : " + filename_ +
                             " Error : " + strerror(alloc_status));
    }
  }

  TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
  void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                   file_offset_);
  if (ptr == MAP_FAILED) {
    // Capture errno right away so the reason for the failure is reported.
    const int mmap_errno = errno;
    return STATUS(IOError, "MMap failed on " + filename_ +
                           " Error : " + strerror(mmap_errno));
  }
  TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);

  base_ = reinterpret_cast<char*>(ptr);
  limit_ = base_ + map_size_;
  dst_ = base_;
  last_sync_ = base_;
  return Status::OK();
#else
  return STATUS(NotSupported, "This platform doesn't support fallocate()");
#endif
}
203
204
0
// Synchronously flushes the pages of the current window that hold bytes
// written since the previous sync. Returns OK immediately when nothing new
// has been written.
Status PosixMmapFile::Msync() {
  const bool nothing_pending = (dst_ == last_sync_);
  if (nothing_pending) {
    return Status::OK();
  }

  // Page-aligned offsets (relative to base_) of the pages holding the first
  // and the last byte that still need to be synced.
  const size_t first_page = TruncateToPageBoundary(last_sync_ - base_);
  const size_t last_page = TruncateToPageBoundary(dst_ - base_ - 1);
  // The sync marker is advanced before msync() is issued.
  last_sync_ = dst_;
  TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);

  const int rc =
      msync(base_ + first_page, last_page - first_page + page_size_, MS_SYNC);
  if (rc < 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
219
220
0
// Truncation is a no-op for this file type: `size` is ignored and OK is
// always returned.
Status PosixMmapFile::Truncate(uint64_t size) {
  static_cast<void>(size);
  return Status::OK();
}
223
224
// Sets up an mmap-backed writable file on descriptor `fd`. No region is
// mapped yet; the initial mapping size is 64KB rounded up to a multiple of
// `page_size`, which must be a power of two.
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
  // page_size must be a power of two, and mmap writes must be enabled.
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
#ifdef ROCKSDB_FALLOCATE_PRESENT
  // Preallocation preferences are copied from the environment options.
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
}
242
243
0
// Closes the file if it is still open (fd_ >= 0). A failure is passed to
// WARN_NOT_OK since a destructor cannot return it.
PosixMmapFile::~PosixMmapFile() {
  const bool still_open = (fd_ >= 0);
  if (!still_open) {
    return;
  }
  WARN_NOT_OK(PosixMmapFile::Close(), "Failed to close posix mmap file");
}
248
249
0
// Copies `data` into the mapped window, replacing the window with a new one
// each time it fills up. Returns the first error reported while unmapping or
// mapping a window.
Status PosixMmapFile::Append(const Slice& data) {
  const char* cursor = data.cdata();
  for (size_t remaining = data.size(); remaining > 0;) {
    assert(base_ <= dst_);
    assert(dst_ <= limit_);
    const size_t avail = limit_ - dst_;
    if (avail == 0) {
      // The window is full (or none is mapped yet): swap in a fresh one.
      Status s = UnmapCurrentRegion();
      if (!s.ok()) {
        return s;
      }
      s = MapNewRegion();
      if (!s.ok()) {
        return s;
      }
      TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
    }

    // `avail` was measured before any remapping, so right after a remap this
    // copies zero bytes and the next iteration sees the new window.
    const size_t chunk = (avail < remaining) ? avail : remaining;
    memcpy(dst_, cursor, chunk);
    dst_ += chunk;
    cursor += chunk;
    remaining -= chunk;
  }
  return Status::OK();
}
276
277
0
// Unmaps the current window, trims the unused tail of the file, and closes
// the descriptor. The descriptor is closed and fd_ is set to -1 even when an
// earlier step failed; the first error encountered is the one returned.
Status PosixMmapFile::Close() {
  // Bytes of the current window that were mapped but never written.
  const size_t unused = limit_ - dst_;

  Status s = UnmapCurrentRegion();
  if (!s.ok()) {
    s = STATUS_IO_ERROR(filename_, errno);
  } else if (unused > 0 && ftruncate(fd_, file_offset_ - unused) < 0) {
    // Trimming the extra space at the end of the file failed.
    s = STATUS_IO_ERROR(filename_, errno);
  }

  const int close_rc = close(fd_);
  if (close_rc < 0 && s.ok()) {
    s = STATUS_IO_ERROR(filename_, errno);
  }

  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return s;
}
302
303
0
// Flush does no work for this file type and always reports success.
Status PosixMmapFile::Flush() {
  return Status::OK();
}
304
305
0
// Calls fdatasync() on the descriptor and then msyncs the mapped window.
// Does nothing and returns OK when FLAGS_never_fsync is set.
Status PosixMmapFile::Sync() {
  if (!FLAGS_never_fsync) {
    if (fdatasync(fd_) < 0) {
      return STATUS_IO_ERROR(filename_, errno);
    }
    return Msync();
  }
  return Status::OK();
}
315
316
/**
317
 * Flush data as well as metadata to stable storage.
318
 */
319
0
Status PosixMmapFile::Fsync() {
320
0
  if (FLAGS_never_fsync) {
321
0
    return Status::OK();
322
0
  }
323
0
  if (fsync(fd_) < 0) {
324
0
    return STATUS_IO_ERROR(filename_, errno);
325
0
  }
326
327
0
  return Msync();
328
0
}
329
330
/**
331
 * Get the size of valid data in the file. This will not match the
332
 * size that is returned from the filesystem because we use mmap
333
 * to extend file by map_size every time.
334
 */
335
0
uint64_t PosixMmapFile::GetFileSize() {
336
0
  size_t used = dst_ - base_;
337
0
  return file_offset_ + used;
338
0
}
339
340
0
// Advises the OS that the cached pages for [offset, offset + length) of the
// underlying file are no longer needed. This is a no-op that returns OK on
// non-Linux platforms.
Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef __linux__
  return Status::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return Status::OK();
  }
  // posix_fadvise() hands back the error number as its return value and does
  // not set errno, so the status must be built from `ret`.
  return STATUS_IO_ERROR(filename_, ret);
#endif
}
352
353
#ifdef ROCKSDB_FALLOCATE_PRESENT
354
// Preallocates `len` bytes at `offset` with fallocate(), passing
// FALLOC_FL_KEEP_SIZE when fallocate_with_keep_size_ is set. Returns OK
// without doing anything when fallocate is not allowed.
Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
  assert(yb::std_util::cmp_less_equal(offset, std::numeric_limits<off_t>::max()));
  assert(yb::std_util::cmp_less_equal(len, std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
  if (!allow_fallocate_) {
    return Status::OK();
  }

  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                           static_cast<off_t>(len));
  if (rc != 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
370
#endif
371
372
/*
373
 * PosixWritableFile
374
 *
375
 * Use posix write to write data to a file.
376
 */
377
// Wraps descriptor `fd` for write()-based output to `fname`. The tracked
// file size starts at zero.
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                     const EnvOptions& options)
    : filename_(fname), fd_(fd), filesize_(0) {
  // This class must not be used when mmap writes were requested.
  assert(!options.use_mmap_writes);
#ifdef ROCKSDB_FALLOCATE_PRESENT
  // Preallocation preferences are copied from the environment options.
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
}
386
387
3.66M
// Closes the file if it is still open (fd_ >= 0). A failure is passed to
// WARN_NOT_OK since a destructor cannot return it.
PosixWritableFile::~PosixWritableFile() {
  const bool still_open = (fd_ >= 0);
  if (!still_open) {
    return;
  }
  WARN_NOT_OK(PosixWritableFile::Close(), "Failed to close posix writable file");
}
392
393
35.8M
// Writes all of `data` to the descriptor, continuing after partial writes and
// retrying writes interrupted by a signal (EINTR). The tracked file size
// grows by data.size() only once everything has been written.
Status PosixWritableFile::Append(const Slice& data) {
  const char* cursor = data.cdata();
  size_t remaining = data.size();
  while (remaining != 0) {
    const ssize_t written = write(fd_, cursor, remaining);
    if (written >= 0) {
      remaining -= written;
      cursor += written;
      continue;
    }
    // write() failed: give up on anything other than an interrupted call.
    if (errno != EINTR) {
      return STATUS_IO_ERROR(filename_, errno);
    }
  }
  filesize_ += data.size();
  return Status::OK();
}
410
411
1.25M
// Truncation is a no-op for this file type: `size` is ignored and OK is
// always returned.
Status PosixWritableFile::Truncate(uint64_t size) {
  static_cast<void>(size);
  return Status::OK();
}
414
415
3.66M
// Releases any space preallocated past the written data and closes the
// descriptor. Only a close() failure is reported; errors from trimming the
// preallocated tail are deliberately ignored. fd_ is set to -1 afterwards.
Status PosixWritableFile::Close() {
  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((unused)) = ftruncate(fd_, filesize_);
#ifdef ROCKSDB_FALLOCATE_PRESENT
    // On some file systems ftruncate only gives back trailing space when the
    // new size is smaller than the current size, so the unused blocks are
    // also released explicitly with FALLOC_FL_PUNCH_HOLE, which is supported
    // on at least:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // The result is ignored because a failure here does not affect
    // correctness.
    IOSTATS_TIMER_GUARD(allocate_nanos);
    if (allow_fallocate_) {
      const auto tail_bytes = block_size * last_allocated_block - filesize_;
      fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                tail_bytes);
    }
#endif
  }

  Status s;
  if (close(fd_) < 0) {
    s = STATUS_IO_ERROR(filename_, errno);
  }
  fd_ = -1;
  return s;
}
453
454
// write out the cached data to the OS cache
455
25.7M
Status PosixWritableFile::Flush() { return Status::OK(); }
456
457
608k
// Flushes file data to stable storage with fdatasync().
//
// Honors FLAGS_never_fsync in the same way as the other sync entry points in
// this file (PosixWritableFile::Fsync, PosixMmapFile::Sync/Fsync,
// PosixDirectory::Fsync): when the flag is set the call is skipped and OK is
// returned. Returns an I/O error carrying errno if fdatasync() fails.
Status PosixWritableFile::Sync() {
  if (FLAGS_never_fsync) {
    return Status::OK();
  }
  if (fdatasync(fd_) < 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
463
464
14.2k
// Calls fsync() on the descriptor unless FLAGS_never_fsync is set, in which
// case nothing is done and OK is returned.
Status PosixWritableFile::Fsync() {
  if (!FLAGS_never_fsync && fsync(fd_) < 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
473
474
2
// Always reports true for this file type.
bool PosixWritableFile::IsSyncThreadSafe() const {
  return true;
}
475
476
99.6k
// Returns the byte count accumulated by Append() calls on this object.
uint64_t PosixWritableFile::GetFileSize() {
  return filesize_;
}
477
478
401
// Advises the OS that the cached pages for [offset, offset + length) of the
// underlying file are no longer needed. This is a no-op that returns OK on
// non-Linux platforms.
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef __linux__
  return Status::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return Status::OK();
  }
  // posix_fadvise() hands back the error number as its return value and does
  // not set errno, so the status must be built from `ret`.
  return STATUS_IO_ERROR(filename_, ret);
#endif
}
490
491
#ifdef ROCKSDB_FALLOCATE_PRESENT
492
// Preallocates `len` bytes at `offset` with fallocate(), passing
// FALLOC_FL_KEEP_SIZE when fallocate_with_keep_size_ is set. Returns OK
// without calling fallocate() when it is not allowed.
Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
  assert(yb::std_util::cmp_less_equal(offset, std::numeric_limits<off_t>::max()));
  assert(yb::std_util::cmp_less_equal(len, std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
  IOSTATS_TIMER_GUARD(allocate_nanos);
  if (!allow_fallocate_) {
    return Status::OK();
  }

  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                           static_cast<off_t>(len));
  if (rc != 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
509
510
// Calls sync_file_range() with SYNC_FILE_RANGE_WRITE for the byte range
// [offset, offset + nbytes). Returns an I/O error carrying errno on failure.
Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
  assert(yb::std_util::cmp_less_equal(offset, std::numeric_limits<off_t>::max()));
  assert(yb::std_util::cmp_less_equal(nbytes, std::numeric_limits<off_t>::max()));
  const int rc = sync_file_range(fd_, static_cast<off_t>(offset),
                                 static_cast<off_t>(nbytes),
                                 SYNC_FILE_RANGE_WRITE);
  if (rc != 0) {
    return STATUS_IO_ERROR(filename_, errno);
  }
  return Status::OK();
}
520
521
size_t PosixWritableFile::GetUniqueId(char* id) const {
522
  return yb::GetUniqueIdFromFile(fd_, pointer_cast<uint8_t*>(id));
523
}
524
#endif
525
526
398k
// Closes the directory descriptor; the result of close() is ignored.
PosixDirectory::~PosixDirectory() {
  close(fd_);
}
527
528
476k
// Calls fsync() on the directory descriptor unless FLAGS_never_fsync is set,
// in which case nothing is done and OK is returned.
Status PosixDirectory::Fsync() {
  if (!FLAGS_never_fsync && fsync(fd_) == -1) {
    return STATUS_IO_ERROR("directory", errno);
  }
  return Status::OK();
}
537
}  // namespace rocksdb
538
#endif