YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/util/env_posix.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
//
5
// The following only applies to changes made to this file as part of YugaByte development.
6
//
7
// Portions Copyright (c) YugaByte, Inc.
8
//
9
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
10
// in compliance with the License.  You may obtain a copy of the License at
11
//
12
// http://www.apache.org/licenses/LICENSE-2.0
13
//
14
// Unless required by applicable law or agreed to in writing, software distributed under the License
15
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16
// or implied.  See the License for the specific language governing permissions and limitations
17
// under the License.
18
//
19
20
#include <dirent.h>
21
#include <fcntl.h>
22
#include <fts.h>
23
#include <pthread.h>
24
#include <stdio.h>
25
#include <stdlib.h>
26
#include <string.h>
27
#include <sys/stat.h>
28
#include <sys/statvfs.h>
29
#include <sys/time.h>
30
#include <sys/types.h>
31
#include <sys/uio.h>
32
#include <time.h>
33
34
#include <set>
35
#include <vector>
36
37
#if defined(__APPLE__)
38
#include <mach-o/dyld.h>
39
#include <sys/sysctl.h>
40
#else
41
#include <linux/falloc.h>
42
#include <sys/sysinfo.h>
43
#endif  // defined(__APPLE__)
44
#include <sys/resource.h>
45
46
#include <glog/logging.h>
47
48
#include "yb/gutil/atomicops.h"
49
#include "yb/gutil/bind.h"
50
#include "yb/gutil/callback.h"
51
#include "yb/gutil/casts.h"
52
#include "yb/gutil/map-util.h"
53
#include "yb/gutil/strings/substitute.h"
54
55
#include "yb/util/alignment.h"
56
#include "yb/util/debug/trace_event.h"
57
#include "yb/util/env.h"
58
#include "yb/util/errno.h"
59
#include "yb/util/file_system_posix.h"
60
#include "yb/util/flag_tags.h"
61
#include "yb/util/format.h"
62
#include "yb/util/locks.h"
63
#include "yb/util/logging.h"
64
#include "yb/util/malloc.h"
65
#include "yb/util/monotime.h"
66
#include "yb/util/path_util.h"
67
#include "yb/util/result.h"
68
#include "yb/util/slice.h"
69
#include "yb/util/status.h"
70
#include "yb/util/status_format.h"
71
#include "yb/util/status_log.h"
72
#include "yb/util/stopwatch.h"
73
#include "yb/util/thread_restrictions.h"
74
75
// Copied from falloc.h. Useful for older kernels that lack support for
76
// hole punching; fallocate(2) will return EOPNOTSUPP.
77
#ifndef FALLOC_FL_KEEP_SIZE
78
#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
79
#endif
80
#ifndef FALLOC_FL_PUNCH_HOLE
81
#define FALLOC_FL_PUNCH_HOLE  0x02 /* de-allocates range */
82
#endif
83
84
// For platforms without fdatasync (like OS X)
85
#ifndef fdatasync
86
240
#define fdatasync fsync
87
#endif
88
89
// For platforms without unlocked_stdio (like OS X)
90
#ifndef fread_unlocked
91
#define fread_unlocked fread
92
#endif
93
94
// See KUDU-588 for details.
95
DEFINE_bool(writable_file_use_fsync, false,
96
            "Use fsync(2) instead of fdatasync(2) for synchronizing dirty "
97
            "data to disk.");
98
TAG_FLAG(writable_file_use_fsync, advanced);
99
100
#ifdef __APPLE__
101
// Never fsync on Mac OS X as we are getting many slow fsync errors in Jenkins and the fsync
102
// implementation is very different in production (on Linux) anyway.
103
#define FLAGS_never_fsync_default true
104
#else
105
#define FLAGS_never_fsync_default false
106
#endif
107
108
DEFINE_bool(never_fsync, FLAGS_never_fsync_default,
109
            "Never fsync() anything to disk. This is used by tests to speed up runtime and improve "
110
            "stability. This is very unsafe to use in production.");
111
112
TAG_FLAG(never_fsync, advanced);
113
TAG_FLAG(never_fsync, unsafe);
114
115
DEFINE_int32(o_direct_block_size_bytes, 4096,
116
             "Size of the block to use when flag durable_wal_write is set.");
117
TAG_FLAG(o_direct_block_size_bytes, advanced);
118
119
DEFINE_int32(o_direct_block_alignment_bytes, 4096,
120
             "Alignment (in bytes) for blocks used for O_DIRECT operations.");
121
TAG_FLAG(o_direct_block_alignment_bytes, advanced);
122
123
DEFINE_test_flag(bool, simulate_fs_without_fallocate, false,
124
    "If true, the system simulates a file system that doesn't support fallocate.");
125
126
DEFINE_test_flag(int64, simulate_free_space_bytes, -1,
127
    "If a non-negative value, GetFreeSpaceBytes will return the specified value.");
128
129
DECLARE_bool(never_fsync);
130
131
using namespace std::placeholders;
132
using base::subtle::Atomic64;
133
using base::subtle::Barrier_AtomicIncrement;
134
using std::vector;
135
using strings::Substitute;
136
137
static __thread uint64_t thread_local_id;
138
static Atomic64 cur_thread_local_id_;
139
140
namespace yb {
141
142
namespace {
143
144
#if defined(__APPLE__)
145
// Simulates Linux's fallocate file preallocation API on OS X.
146
97.0k
int fallocate(int fd, int mode, off_t offset, off_t len) {
147
97.0k
  CHECK_EQ(mode, 0);
148
97.0k
  off_t size = offset + len;
149
150
97.0k
  struct stat stat;
151
97.0k
  int ret = fstat(fd, &stat);
152
97.0k
  if (ret < 0) {
153
0
    return ret;
154
0
  }
155
156
97.1k
  if (stat.st_blocks * 512 < size) {
157
    // The offset field seems to have no effect; the file is always allocated
158
    // with space from 0 to the size. This is probably because OS X does not
159
    // support sparse files.
160
97.1k
    fstore_t store = {F_ALLOCATECONTIG, F_PEOFPOSMODE, 0, size};
161
97.1k
    if (fcntl(fd, F_PREALLOCATE, &store) < 0) {
162
0
      LOG(INFO) << "Unable to allocate contiguous disk space, attempting non-contiguous allocation";
163
0
      store.fst_flags = F_ALLOCATEALL;
164
0
      ret = fcntl(fd, F_PREALLOCATE, &store);
165
0
      if (ret < 0) {
166
0
        return ret;
167
0
      }
168
97.0k
    }
169
97.1k
  }
170
171
97.1k
  if (stat.st_size < size) {
172
    // fcntl does not change the file size, so set it if necessary.
173
97.1k
    return ftruncate(fd, size);
174
97.1k
  }
175
18.4E
  return 0;
176
18.4E
}
177
#endif
178
179
// Close file descriptor when object goes out of scope.
180
class ScopedFdCloser {
181
 public:
182
  explicit ScopedFdCloser(int fd)
183
1
    : fd_(fd) {
184
1
  }
185
186
1
  ~ScopedFdCloser() {
187
1
    ThreadRestrictions::AssertIOAllowed();
188
1
    int err = ::close(fd_);
189
1
    if (PREDICT_FALSE(err != 0)) {
190
0
      PLOG(WARNING) << "Failed to close fd " << fd_;
191
0
    }
192
1
  }
193
194
 private:
195
  int fd_;
196
};
197
198
#define STATUS_IO_ERROR(context, err_number) \
199
1.07M
    STATUS_FROM_ERRNO_SPECIAL_EIO_HANDLING(context, err_number)
200
201
1.47M
static Status DoSync(int fd, const string& filename) {
202
1.47M
  ThreadRestrictions::AssertIOAllowed();
203
1.47M
  if (FLAGS_never_fsync) {
204
1.47M
    return Status::OK();
205
1.47M
  }
206
240
  if (FLAGS_writable_file_use_fsync) {
207
0
    if (fsync(fd) < 0) {
208
0
      return STATUS_IO_ERROR(filename, errno);
209
0
    }
210
240
  } else {
211
240
    if (fdatasync(fd) < 0) {
212
0
      return STATUS_IO_ERROR(filename, errno);
213
0
    }
214
240
  }
215
240
  return Status::OK();
216
240
}
217
218
7.24k
static Status DoOpen(const string& filename, Env::CreateMode mode, int* fd, int extra_flags = 0) {
219
7.24k
  ThreadRestrictions::AssertIOAllowed();
220
7.24k
  int flags = O_RDWR;
221
7.24k
  switch (mode) {
222
7.23k
    case Env::CREATE_IF_NON_EXISTING_TRUNCATE:
223
7.23k
      flags |= O_CREAT | O_TRUNC;
224
7.23k
      break;
225
6
    case Env::CREATE_NON_EXISTING:
226
6
      flags |= O_CREAT | O_EXCL;
227
6
      break;
228
2
    case Env::OPEN_EXISTING:
229
2
      break;
230
0
    default:
231
0
      return STATUS(NotSupported, Substitute("Unknown create mode $0", mode));
232
7.24k
  }
233
234
7.24k
  const int f = open(filename.c_str(), flags | extra_flags, 0644);
235
7.24k
  if (f < 0) {
236
3
    return STATUS_IO_ERROR(filename, errno);
237
3
  }
238
7.24k
  *fd = f;
239
7.24k
  return Status::OK();
240
7.24k
}
241
242
template <class Extractor>
243
487k
Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) {
244
487k
  TRACE_EVENT1("io", event, "path", fname);
245
487k
  ThreadRestrictions::AssertIOAllowed();
246
487k
  struct stat sbuf;
247
487k
  if (stat(fname.c_str(), &sbuf) != 0) {
248
1
    return STATUS_IO_ERROR(fname, errno);
249
1
  }
250
487k
  return extractor(sbuf);
251
487k
}
env_posix.cc:_ZN2yb12_GLOBAL__N_111GetFileStatIZNS0_16PosixFileFactory11GetFileSizeERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEUlRK4statE_EENS_6ResultIyEESB_PKcT_
Line
Count
Source
243
482k
Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) {
244
482k
  TRACE_EVENT1("io", event, "path", fname);
245
482k
  ThreadRestrictions::AssertIOAllowed();
246
482k
  struct stat sbuf;
247
482k
  if (stat(fname.c_str(), &sbuf) != 0) {
248
0
    return STATUS_IO_ERROR(fname, errno);
249
0
  }
250
482k
  return extractor(sbuf);
251
482k
}
env_posix.cc:_ZN2yb12_GLOBAL__N_111GetFileStatIZNS0_8PosixEnv12GetFileINodeERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEUlRK4statE_EENS_6ResultIyEESB_PKcT_
Line
Count
Source
243
4.50k
Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) {
244
4.50k
  TRACE_EVENT1("io", event, "path", fname);
245
4.50k
  ThreadRestrictions::AssertIOAllowed();
246
4.50k
  struct stat sbuf;
247
4.50k
  if (stat(fname.c_str(), &sbuf) != 0) {
248
0
    return STATUS_IO_ERROR(fname, errno);
249
0
  }
250
4.50k
  return extractor(sbuf);
251
4.50k
}
Unexecuted instantiation: env_posix.cc:_ZN2yb12_GLOBAL__N_111GetFileStatIZNS0_8PosixEnv17GetFileSizeOnDiskERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEUlRK4statE_EENS_6ResultIyEESB_PKcT_
env_posix.cc:_ZN2yb12_GLOBAL__N_111GetFileStatIZNS0_8PosixEnv12GetBlockSizeERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEUlRK4statE_EENS_6ResultIyEESB_PKcT_
Line
Count
Source
243
3
Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) {
244
3
  TRACE_EVENT1("io", event, "path", fname);
245
3
  ThreadRestrictions::AssertIOAllowed();
246
3
  struct stat sbuf;
247
3
  if (stat(fname.c_str(), &sbuf) != 0) {
248
1
    return STATUS_IO_ERROR(fname, errno);
249
1
  }
250
2
  return extractor(sbuf);
251
2
}
252
253
38.3k
Result<struct statvfs> GetFilesystemStats(const std::string& path) {
254
38.3k
  struct statvfs stat;
255
38.3k
  auto ret = statvfs(path.c_str(), &stat);
256
38.3k
  if (ret != 0) {
257
13.3k
    if (errno == EACCES) {
258
0
      return STATUS_SUBSTITUTE(NotAuthorized,
259
0
          "Caller doesn't have the required permission on a component of the path $0",
260
0
          path);
261
13.3k
    } else if (errno == EIO) {
262
0
      return STATUS_SUBSTITUTE(IOError,
263
0
          "I/O error occurred while reading from '$0' filesystem",
264
0
          path);
265
13.3k
    } else if (errno == ELOOP) {
266
0
      return STATUS_SUBSTITUTE(InternalError,
267
0
          "Too many symbolic links while translating '$0' path",
268
0
          path);
269
13.3k
    } else if (errno == ENAMETOOLONG) {
270
0
      return STATUS_SUBSTITUTE(NotSupported,
271
0
          "Path '$0' is too long",
272
0
          path);
273
13.3k
    } else if (errno == ENOENT) {
274
13.3k
      return STATUS_SUBSTITUTE(NotFound,
275
13.3k
          "File specified by path '$0' doesn't exist",
276
13.3k
          path);
277
0
    } else if (errno == ENOMEM) {
278
0
      return STATUS(InternalError, "Insufficient memory");
279
0
    } else if (errno == ENOSYS) {
280
0
      return STATUS_SUBSTITUTE(NotSupported,
281
0
          "Filesystem for path '$0' doesn't support statvfs",
282
0
          path);
283
0
    } else if (errno == ENOTDIR) {
284
0
      return STATUS_SUBSTITUTE(InvalidArgument,
285
0
          "A component of the path '$0' is not a directory",
286
0
          path);
287
0
    } else {
288
0
      return STATUS_SUBSTITUTE(InternalError,
289
0
          "Failed to read information about filesystem for path '%s': errno=$0: $1",
290
0
          path,
291
0
          errno,
292
0
          ErrnoToString(errno));
293
0
    }
294
25.0k
  }
295
296
25.0k
  return stat;
297
25.0k
}
298
299
// Use non-memory mapped POSIX files to write data to a file.
300
//
301
// TODO (perf) investigate zeroing a pre-allocated allocated area in
302
// order to further improve Sync() performance.
303
class PosixWritableFile : public WritableFile {
304
 public:
305
  PosixWritableFile(const std::string& fname, int fd, uint64_t file_size,
306
                    bool sync_on_close)
307
      : filename_(fname),
308
        fd_(fd),
309
        sync_on_close_(sync_on_close),
310
        filesize_(file_size),
311
        pre_allocated_size_(0),
312
1.23M
        pending_sync_(false) {}
313
314
1.19M
  ~PosixWritableFile() {
315
1.19M
    if (fd_ >= 0) {
316
5.91k
      WARN_NOT_OK(Close(), "Failed to close " + filename_);
317
5.91k
    }
318
1.19M
  }
319
320
2.42M
  Status Append(const Slice& data) override {
321
2.42M
    vector<Slice> data_vector;
322
2.42M
    data_vector.push_back(data);
323
2.42M
    return AppendVector(data_vector);
324
2.42M
  }
325
326
16.3M
  Status AppendSlices(const Slice* slices, size_t num) override {
327
16.3M
    ThreadRestrictions::AssertIOAllowed();
328
16.3M
    static const size_t kIovMaxElements = IOV_MAX;
329
330
16.3M
    Status s;
331
32.6M
    for (size_t i = 0; i < num && s.ok(); i += kIovMaxElements) {
332
16.3M
      size_t n = std::min(num - i, kIovMaxElements);
333
16.3M
      s = DoWritev(slices + i, n);
334
16.3M
    }
335
336
16.3M
    pending_sync_ = true;
337
16.3M
    return s;
338
16.3M
  }
339
340
97.1k
  Status PreAllocate(uint64_t size) override {
341
97.1k
    TRACE_EVENT1("io", "PosixWritableFile::PreAllocate", "path", filename_);
342
97.1k
    ThreadRestrictions::AssertIOAllowed();
343
97.1k
    uint64_t offset = std::max(filesize_, pre_allocated_size_);
344
97.1k
    if (PREDICT_FALSE(FLAGS_TEST_simulate_fs_without_fallocate)) {
345
0
      YB_LOG_FIRST_N(WARNING, 1) << "Simulating a filesystem without fallocate() support";
346
0
      return Status::OK();
347
0
    }
348
97.1k
    if (fallocate(fd_, 0, offset, size) < 0) {
349
0
      if (errno == EOPNOTSUPP) {
350
0
        YB_LOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate().";
351
0
      } else if (errno == ENOSYS) {
352
0
        YB_LOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate().";
353
0
      } else {
354
0
        return STATUS_IO_ERROR(filename_, errno);
355
0
      }
356
      // We don't want to modify pre_allocated_size_ since nothing was pre-allocated.
357
0
      return Status::OK();
358
0
    }
359
97.1k
    pre_allocated_size_ = offset + size;
360
97.1k
    return Status::OK();
361
97.1k
  }
362
363
1.19M
  Status Close() override {
364
1.19M
    TRACE_EVENT1("io", "PosixWritableFile::Close", "path", filename_);
365
1.19M
    ThreadRestrictions::AssertIOAllowed();
366
1.19M
    Status s;
367
368
    // If we've allocated more space than we used, truncate to the
369
    // actual size of the file and perform Sync().
370
1.19M
    if (filesize_ < pre_allocated_size_) {
371
54.8k
      if (ftruncate(fd_, filesize_) < 0) {
372
0
        s = STATUS_IO_ERROR(filename_, errno);
373
0
        pending_sync_ = true;
374
0
      }
375
54.8k
    }
376
377
1.19M
    if (sync_on_close_) {
378
61.8k
      Status sync_status = Sync();
379
61.8k
      if (!sync_status.ok()) {
380
0
        LOG(ERROR) << "Unable to Sync " << filename_ << ": " << sync_status.ToString();
381
0
        if (s.ok()) {
382
0
          s = sync_status;
383
0
        }
384
0
      }
385
61.8k
    }
386
387
1.19M
    if (close(fd_) < 0) {
388
0
      if (s.ok()) {
389
0
        s = STATUS_IO_ERROR(filename_, errno);
390
0
      }
391
0
    }
392
393
1.19M
    fd_ = -1;
394
1.19M
    return s;
395
1.19M
  }
396
397
1
  Status Flush(FlushMode mode) override {
398
1
    TRACE_EVENT1("io", "PosixWritableFile::Flush", "path", filename_);
399
1
    ThreadRestrictions::AssertIOAllowed();
400
1
    if (FLAGS_never_fsync) {
401
1
      return Status::OK();
402
1
    }
403
#if defined(__linux__)
404
    int flags = SYNC_FILE_RANGE_WRITE;
405
    if (mode == FLUSH_SYNC) {
406
      flags |= SYNC_FILE_RANGE_WAIT_AFTER;
407
    }
408
    if (sync_file_range(fd_, 0, 0, flags) < 0) {
409
      return STATUS_IO_ERROR(filename_, errno);
410
    }
411
#else
412
0
    if (fsync(fd_) < 0) {
413
0
      return STATUS_IO_ERROR(filename_, errno);
414
0
    }
415
0
#endif
416
0
    return Status::OK();
417
0
  }
418
419
1.47M
  Status Sync() override {
420
1.47M
    TRACE_EVENT1("io", "PosixWritableFile::Sync", "path", filename_);
421
1.47M
    ThreadRestrictions::AssertIOAllowed();
422
1.47M
    LOG_SLOW_EXECUTION(WARNING, 1000, Substitute("sync call for $0", filename_)) {
423
1.47M
      if (pending_sync_) {
424
1.47M
        pending_sync_ = false;
425
1.47M
        RETURN_NOT_OK(DoSync(fd_, filename_));
426
1.47M
      }
427
1.47M
    }
428
1.47M
    return Status::OK();
429
1.47M
  }
430
431
13.9M
  uint64_t Size() const override {
432
13.9M
    return filesize_;
433
13.9M
  }
434
435
11
  const string& filename() const override { return filename_; }
436
437
 protected:
438
    const std::string filename_;
439
    int fd_;
440
    bool sync_on_close_;
441
    uint64_t filesize_;
442
    uint64_t pre_allocated_size_;
443
    bool pending_sync_;
444
445
 private:
446
16.3M
  Status DoWritev(const Slice* slices, size_t n) {
447
16.3M
    ThreadRestrictions::AssertIOAllowed();
448
16.3M
    DCHECK_LE(n, IOV_MAX);
449
450
16.3M
    struct iovec iov[n];
451
16.3M
    ssize_t nbytes = 0;
452
453
46.5M
    for (size_t i = 0; i < n; ++i) {
454
30.2M
      const Slice& data = slices[i];
455
30.2M
      iov[i].iov_base = const_cast<uint8_t*>(data.data());
456
30.2M
      iov[i].iov_len = data.size();
457
30.2M
      nbytes += data.size();
458
30.2M
    }
459
460
16.3M
    ssize_t written = writev(fd_, iov, narrow_cast<int>(n));
461
462
16.3M
    if (PREDICT_FALSE(written == -1)) {
463
0
      int err = errno;
464
0
      return STATUS_IO_ERROR(filename_, err);
465
0
    }
466
467
16.3M
    filesize_ += written;
468
469
16.3M
    if (PREDICT_FALSE(written != nbytes)) {
470
0
      return STATUS_FORMAT(
471
0
          IOError, "writev error: expected to write $0 bytes, wrote $1 bytes instead",
472
0
          nbytes, written);
473
0
    }
474
475
16.3M
    return Status::OK();
476
16.3M
  }
477
};
478
479
#if defined(__linux__)
480
class PosixDirectIOWritableFile final : public PosixWritableFile {
481
 public:
482
  PosixDirectIOWritableFile(const std::string &fname, int fd, uint64_t file_size,
483
                            bool sync_on_close)
484
      : PosixWritableFile(fname, fd, file_size, false /* sync_on_close */) {
485
486
    if (file_size != 0) {
487
      // For now, we don't support appending to an already existing file (of non-zero size).
488
      // TODO(hector): If file size is not a multiple of block_size_, read the
489
      // last aligned block.
490
      LOG(FATAL) << "file size != 0";
491
    }
492
493
    next_write_offset_ = 0;
494
    last_block_used_bytes_ = 0;
495
    block_size_ = FLAGS_o_direct_block_size_bytes;
496
    last_block_idx_ = 0;
497
    has_new_data_ = false;
498
    real_size_ = 0;
499
    CHECK_GE(block_size_, 512);
500
  }
501
502
  ~PosixDirectIOWritableFile() {
503
    if (fd_ >= 0) {
504
      WARN_NOT_OK(Close(), "Failed to close " + filename_);
505
    }
506
  }
507
508
  Status Append(const Slice &const_data_slice) override {
509
    ThreadRestrictions::AssertIOAllowed();
510
    Slice data_slice = const_data_slice;
511
512
    while (data_slice.size() > 0) {
513
      size_t max_data = IOV_MAX * block_size_ - BufferedByteCount();
514
      CHECK_GT(IOV_MAX, 0);
515
      CHECK_GT(IOV_MAX * block_size_, BufferedByteCount());
516
      CHECK_GT(max_data, 0);
517
      const auto data = Slice(data_slice.data(), std::min(data_slice.size(), max_data));
518
519
      RETURN_NOT_OK(MaybeAllocateMemory(data.size()));
520
      RETURN_NOT_OK(WriteToBuffer(data));
521
522
      if (data_slice.size() >= max_data) {
523
        data_slice.remove_prefix(max_data);
524
        RETURN_NOT_OK(Sync());
525
      } else {
526
        break;
527
      }
528
    }
529
    real_size_ += const_data_slice.size();
530
    return Status::OK();
531
  }
532
533
  Status AppendSlices(const Slice* slices, size_t num) override {
534
    ThreadRestrictions::AssertIOAllowed();
535
    for (auto end = slices + num; slices != end; ++slices) {
536
      RETURN_NOT_OK(Append(*slices));
537
    }
538
    return Status::OK();
539
  }
540
541
  Status Close() override {
542
    TRACE_EVENT1("io", "PosixDirectIOWritableFile::Close", "path", filename_);
543
    ThreadRestrictions::AssertIOAllowed();
544
    RETURN_NOT_OK(Sync());
545
    LOG(INFO) << "Closing file " << filename_ << " with " << block_ptr_vec_.size() << " blocks";
546
    off_t fsize;
547
    fsize = lseek(fd_, 0, SEEK_END);
548
    CHECK_EQ(fsize, std::max(filesize_, pre_allocated_size_));
549
550
    if (real_size_ < filesize_ || real_size_ < pre_allocated_size_) {
551
      LOG(INFO) << filename_ << ": Truncating file from size: " << filesize_
552
                << " to size: " << real_size_
553
                << ". Preallocated size: " << pre_allocated_size_;
554
      if (ftruncate(fd_, real_size_) != 0) {
555
        return STATUS_IO_ERROR(filename_, errno);
556
      }
557
    }
558
559
    if (close(fd_) < 0) {
560
      return STATUS_IO_ERROR(filename_, errno);
561
    }
562
563
    fd_ = -1;
564
    return Status::OK();
565
  }
566
567
  Status Flush(FlushMode mode) override {
568
    ThreadRestrictions::AssertIOAllowed();
569
    return Sync();
570
  }
571
572
  Status Sync() override {
573
    ThreadRestrictions::AssertIOAllowed();
574
    return DoWrite();
575
  }
576
577
  uint64_t Size() const override {
578
    return real_size_;
579
  }
580
581
 private:
582
  // The number of bytes buffered so far (that will be written out as part of the next write).
583
  size_t BufferedByteCount() {
584
    return last_block_idx_ * block_size_ + last_block_used_bytes_;
585
  }
586
587
  Status WriteToBuffer(Slice data) {
588
    auto last_block_used_bytes = last_block_used_bytes_;
589
    auto last_block_idx = last_block_idx_;
590
    auto total_bytes_cached = BufferedByteCount();
591
    auto request_size = data.size();
592
593
    CHECK_GT(data.size(), 0);
594
    // Used only for the first block. Reset to 0 after the first memcpy.
595
    size_t block_offset = last_block_used_bytes_;
596
    auto i = last_block_idx_;
597
    if (last_block_used_bytes_ == block_size_) {
598
      // Start writing in a new block if the last block is full.
599
      i++;
600
      block_offset = 0;
601
    }
602
    while (data.size() > 0) {
603
      last_block_used_bytes_ = block_size_;
604
      size_t block_data_size = std::min(data.size(), block_size_ - block_offset);
605
      if (block_data_size + block_offset < block_size_) {
606
        // Writing the last block.
607
        last_block_used_bytes_ = block_data_size + block_offset;
608
        memset(&block_ptr_vec_[i].get()[last_block_used_bytes_], 0,
609
               block_size_ - last_block_used_bytes_);
610
      }
611
      CHECK(i < block_ptr_vec_.size());
612
      memcpy(&block_ptr_vec_[i].get()[block_offset], data.data(), block_data_size);
613
      block_offset = 0;
614
      data.remove_prefix(block_data_size);
615
      last_block_idx_ = i++;
616
      has_new_data_ = true;
617
    }
618
619
    CHECK_GE(last_block_idx_, last_block_idx);
620
    if (last_block_idx_ == last_block_idx) {
621
      CHECK_GE(last_block_used_bytes_, last_block_used_bytes);
622
    }
623
    CHECK_EQ(BufferedByteCount(), total_bytes_cached + request_size);
624
625
    return Status::OK();
626
  }
627
628
  Status DoWrite() {
629
    if (!has_new_data_) {
630
      return Status::OK();
631
    }
632
    CHECK_LE(last_block_used_bytes_, block_size_);
633
    CHECK_LT(last_block_idx_, block_ptr_vec_.size());
634
    auto blocks_to_write = last_block_idx_ + 1;
635
    CHECK_LE(blocks_to_write, IOV_MAX);
636
637
    struct iovec iov[blocks_to_write];
638
    for (size_t j = 0; j < blocks_to_write; j++) {
639
      iov[j].iov_base = block_ptr_vec_[j].get();
640
      iov[j].iov_len = block_size_;
641
    }
642
    ssize_t bytes_to_write = blocks_to_write * block_size_;
643
    ssize_t written = pwritev(fd_, iov, narrow_cast<int>(blocks_to_write), next_write_offset_);
644
645
    if (PREDICT_FALSE(written == -1)) {
646
      int err = errno;
647
      return STATUS_IO_ERROR(filename_, err);
648
    }
649
650
    if (PREDICT_FALSE(written != bytes_to_write)) {
651
      return STATUS(IOError,
652
                    Substitute("pwritev error: expected to write $0 bytes, wrote $1 bytes instead",
653
                               bytes_to_write, written));
654
    }
655
656
    filesize_ = next_write_offset_ + written;
657
    CHECK_EQ(filesize_, align_up(filesize_, block_size_));
658
659
    next_write_offset_ = filesize_;
660
661
    if (last_block_used_bytes_ != block_size_) {
662
      // Next write will happen at filesize_ - block_size_ offset in the file if the last block is
663
      // not full.
664
      next_write_offset_ -= block_size_;
665
666
      // Since the last block is only partially full, make it the first block so we can append to
667
      // it in the next call.
668
      if (last_block_idx_ > 0) {
669
        std::swap(block_ptr_vec_[0], block_ptr_vec_[last_block_idx_]);
670
      }
671
672
    } else {
673
      last_block_used_bytes_ = 0;
674
    }
675
    last_block_idx_ = 0;
676
    has_new_data_ = false;
677
    return Status::OK();
678
  }
679
680
  Status MaybeAllocateMemory(size_t data_size) {
681
    auto buffered_data_size = last_block_idx_ * block_size_ + last_block_used_bytes_;
682
    auto bytes_to_write = align_up(buffered_data_size + data_size, block_size_);
683
    auto blocks_to_write = bytes_to_write / block_size_;
684
685
    if (blocks_to_write > block_ptr_vec_.size()) {
686
      auto nblocks = blocks_to_write - block_ptr_vec_.size();
687
      for (size_t i = 0; i < nblocks; i++) {
688
        void *temp_buf = nullptr;
689
        auto err = posix_memalign(&temp_buf, FLAGS_o_direct_block_alignment_bytes, block_size_);
690
        if (err) {
691
          return STATUS(RuntimeError, "Unable to allocate memory", Errno(err));
692
        }
693
694
        uint8_t *start = static_cast<uint8_t *>(temp_buf);
695
        block_ptr_vec_.push_back(std::shared_ptr<uint8_t>(start, [](uint8_t *p) { free(p); }));
696
      }
697
698
      CHECK_EQ(block_ptr_vec_.size() * block_size_, bytes_to_write);
699
    }
700
    return Status::OK();
701
  }
702
703
  size_t next_write_offset_;
704
  vector<std::shared_ptr<uint8_t>> block_ptr_vec_;
705
  size_t last_block_used_bytes_;
706
  size_t last_block_idx_;
707
  size_t block_size_;
708
  bool has_new_data_;
709
  size_t real_size_;
710
};
711
#endif
712
713
class PosixRWFile final : public RWFile {
714
// is not employed.
715
 public:
716
  PosixRWFile(string fname, int fd, bool sync_on_close)
717
      : filename_(std::move(fname)),
718
        fd_(fd),
719
        sync_on_close_(sync_on_close),
720
2
        pending_sync_(false) {}
721
722
2
  ~PosixRWFile() {
723
2
    if (fd_ >= 0) {
724
      // Virtual method call in destructor.
725
2
      WARN_NOT_OK(Close(), "Failed to close " + filename_);
726
2
    }
727
2
  }
728
729
  virtual Status Read(uint64_t offset, size_t length,
730
3
                      Slice* result, uint8_t* scratch) const override {
731
3
    ThreadRestrictions::AssertIOAllowed();
732
3
    auto rem = length;
733
3
    uint8_t* dst = scratch;
734
6
    while (rem > 0) {
735
3
      ssize_t r = pread(fd_, dst, rem, offset);
736
3
      if (r < 0) {
737
        // An error: return a non-ok status.
738
0
        return STATUS_IO_ERROR(filename_, errno);
739
0
      }
740
3
      Slice this_result(dst, r);
741
3
      DCHECK_LE(this_result.size(), rem);
742
3
      if (this_result.size() == 0) {
743
        // EOF
744
0
        return STATUS_FORMAT(IOError, "EOF trying to read $0 bytes at offset $1", length, offset);
745
0
      }
746
3
      dst += this_result.size();
747
3
      rem -= this_result.size();
748
3
      offset += this_result.size();
749
3
    }
750
3
    DCHECK_EQ(0, rem);
751
3
    *result = Slice(scratch, length);
752
3
    return Status::OK();
753
3
  }
754
755
4
  Status Write(uint64_t offset, const Slice& data) override {
756
4
    ThreadRestrictions::AssertIOAllowed();
757
4
    ssize_t written = pwrite(fd_, data.data(), data.size(), offset);
758
759
4
    if (PREDICT_FALSE(written == -1)) {
760
0
      int err = errno;
761
0
      return STATUS_IO_ERROR(filename_, err);
762
0
    }
763
764
4
    if (PREDICT_FALSE(written != implicit_cast<ssize_t>(data.size()))) {
765
0
      return STATUS(IOError,
766
0
          Substitute("pwrite error: expected to write $0 bytes, wrote $1 bytes instead",
767
0
                     data.size(), written));
768
0
    }
769
770
4
    pending_sync_ = true;
771
4
    return Status::OK();
772
4
  }
773
774
0
  Status PreAllocate(uint64_t offset, size_t length) override {
775
0
    TRACE_EVENT1("io", "PosixRWFile::PreAllocate", "path", filename_);
776
0
    ThreadRestrictions::AssertIOAllowed();
777
0
    if (fallocate(fd_, 0, offset, length) < 0) {
778
0
      if (errno == EOPNOTSUPP) {
779
0
        YB_LOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate().";
780
0
      } else if (errno == ENOSYS) {
781
0
        YB_LOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate().";
782
0
      } else {
783
0
        return STATUS_IO_ERROR(filename_, errno);
784
0
      }
785
0
    }
786
0
    return Status::OK();
787
0
  }
788
789
0
  Status PunchHole(uint64_t offset, size_t length) override {
790
#if defined(__linux__)
791
    TRACE_EVENT1("io", "PosixRWFile::PunchHole", "path", filename_);
792
    ThreadRestrictions::AssertIOAllowed();
793
    if (fallocate(fd_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length) < 0) {
794
      return STATUS_IO_ERROR(filename_, errno);
795
    }
796
    return Status::OK();
797
#else
798
0
    return STATUS(NotSupported, "Hole punching not supported on this platform");
799
0
#endif
800
0
  }
801
802
0
  Status Flush(FlushMode mode, uint64_t offset, size_t length) override {
803
0
    TRACE_EVENT1("io", "PosixRWFile::Flush", "path", filename_);
804
0
    ThreadRestrictions::AssertIOAllowed();
805
0
    if (FLAGS_never_fsync) {
806
0
      return Status::OK();
807
0
    }
808
#if defined(__linux__)
809
    int flags = SYNC_FILE_RANGE_WRITE;
810
    if (mode == FLUSH_SYNC) {
811
      flags |= SYNC_FILE_RANGE_WAIT_AFTER;
812
    }
813
    if (sync_file_range(fd_, offset, length, flags) < 0) {
814
      return STATUS_IO_ERROR(filename_, errno);
815
    }
816
#else
817
0
    if (fsync(fd_) < 0) {
818
0
      return STATUS_IO_ERROR(filename_, errno);
819
0
    }
820
0
#endif
821
0
    return Status::OK();
822
0
  }
823
824
0
  Status Sync() override {
825
0
    TRACE_EVENT1("io", "PosixRWFile::Sync", "path", filename_);
826
0
    ThreadRestrictions::AssertIOAllowed();
827
0
    LOG_SLOW_EXECUTION(WARNING, 1000, Substitute("sync call for $0", filename())) {
828
0
      if (pending_sync_) {
829
0
        pending_sync_ = false;
830
0
        RETURN_NOT_OK(DoSync(fd_, filename_));
831
0
      }
832
0
    }
833
0
    return Status::OK();
834
0
  }
835
836
2
  Status Close() override {
837
2
    TRACE_EVENT1("io", "PosixRWFile::Close", "path", filename_);
838
2
    ThreadRestrictions::AssertIOAllowed();
839
2
    Status s;
840
841
2
    if (sync_on_close_) {
842
      // Virtual function call in destructor.
843
0
      s = Sync();
844
0
      if (!s.ok()) {
845
0
        LOG(ERROR) << "Unable to Sync " << filename_ << ": " << s.ToString();
846
0
      }
847
0
    }
848
849
2
    if (close(fd_) < 0) {
850
0
      if (s.ok()) {
851
0
        s = STATUS_IO_ERROR(filename_, errno);
852
0
      }
853
0
    }
854
855
2
    fd_ = -1;
856
2
    return s;
857
2
  }
858
859
2
  Status Size(uint64_t* size) const override {
860
2
    TRACE_EVENT1("io", "PosixRWFile::Size", "path", filename_);
861
2
    ThreadRestrictions::AssertIOAllowed();
862
2
    struct stat st;
863
2
    if (fstat(fd_, &st) == -1) {
864
0
      return STATUS_IO_ERROR(filename_, errno);
865
0
    }
866
2
    *size = st.st_size;
867
2
    return Status::OK();
868
2
  }
869
870
0
  const string& filename() const override {
871
0
    return filename_;
872
0
  }
873
874
 private:
875
  const std::string filename_;
876
  int fd_;
877
  bool sync_on_close_;
878
  bool pending_sync_;
879
};
880
881
// Set of pathnames that are locked, and a mutex for protecting changes to the set.
882
static std::set<std::string> locked_files;
883
static std::mutex mutex_locked_files;
884
885
// Helper function to lock/unlock file whose filename and file descriptor are passed in. Returns -1
886
// on failure and a value other than -1 (as mentioned in fcntl() doc) on success.
887
static int LockOrUnlock(const std::string& fname,
888
                        int fd,
889
                        bool lock,
890
8.45k
                        bool recursive_lock_ok) {
891
8.45k
  std::lock_guard<std::mutex> guard(mutex_locked_files);
892
8.45k
  if (lock) {
893
    // If recursive locks on the same file must be disallowed, but the specified file name already
894
    // exists in the locked_files set, then it is already locked, so we fail this lock attempt.
895
    // Otherwise, we insert the specified file name into locked_files. This check is needed because
896
    // fcntl() does not detect lock conflict if the fcntl is issued by the same thread that earlier
897
    // acquired this lock.
898
8.45k
    if (!locked_files.insert(fname).second && !recursive_lock_ok) {
899
0
      errno = ENOLCK;
900
0
      return -1;
901
0
    }
902
0
  } else {
903
    // If we are unlocking, then verify that we had locked it earlier, it should already exist in
904
    // locked_files. Remove it from locked_files.
905
0
    if (locked_files.erase(fname) != 1) {
906
0
      errno = ENOLCK;
907
0
      return -1;
908
0
    }
909
8.45k
  }
910
8.45k
  errno = 0;
911
8.45k
  struct flock f;
912
8.45k
  memset(&f, 0, sizeof(f));
913
8.45k
  f.l_type = (lock ? F_WRLCK : F_UNLCK);
914
8.45k
  f.l_whence = SEEK_SET;
915
8.45k
  f.l_start = 0;
916
8.45k
  f.l_len = 0; // Lock/unlock entire file.
917
8.45k
  int value = fcntl(fd, F_SETLK, &f);
918
8.45k
  if (value == -1 && lock) {
919
    // If there is an error in locking, then remove the pathname from locked_files.
920
0
    locked_files.erase(fname);
921
0
  }
922
8.45k
  return value;
923
8.45k
}
924
925
class PosixFileLock : public FileLock {
926
 public:
927
  int fd_;
928
  std::string filename;
929
};
930
931
class PosixEnv : public Env {
932
 public:
933
  PosixEnv();
934
  explicit PosixEnv(std::unique_ptr<FileFactory> file_factory);
935
112
  virtual ~PosixEnv() = default;
936
937
  virtual Status NewSequentialFile(const std::string& fname,
938
2.08k
                                   std::unique_ptr<SequentialFile>* result) override {
939
2.08k
    return file_factory_->NewSequentialFile(fname, result);
940
2.08k
  }
941
942
  virtual Status NewRandomAccessFile(const std::string& fname,
943
542k
                                     std::unique_ptr<RandomAccessFile>* result) override {
944
542k
    return file_factory_->NewRandomAccessFile(fname, result);
945
542k
  }
946
947
  virtual Status NewWritableFile(const std::string& fname,
948
1.34k
                                 std::unique_ptr<WritableFile>* result) override {
949
1.34k
    return file_factory_->NewWritableFile(fname, result);
950
1.34k
  }
951
952
  virtual Status NewWritableFile(const WritableFileOptions& opts,
953
                                 const std::string& fname,
954
5.90k
                                 std::unique_ptr<WritableFile>* result) override {
955
5.90k
    return file_factory_->NewWritableFile(opts, fname, result);
956
5.90k
  }
957
958
  virtual Status NewTempWritableFile(const WritableFileOptions& opts,
959
                                     const std::string& name_template,
960
                                     std::string* created_filename,
961
1.23M
                                     std::unique_ptr<WritableFile>* result) override {
962
1.23M
    return file_factory_->NewTempWritableFile(opts, name_template, created_filename, result);
963
1.23M
  }
964
965
  virtual Status NewRWFile(const string& fname,
966
1
                           std::unique_ptr<RWFile>* result) override {
967
1
    return file_factory_->NewRWFile(fname, result);
968
1
  }
969
970
  virtual Status NewRWFile(const RWFileOptions& opts,
971
                           const string& fname,
972
2
                           std::unique_ptr<RWFile>* result) override {
973
2
    return file_factory_->NewRWFile(opts, fname, result);
974
2
  }
975
976
914k
  bool FileExists(const std::string& fname) override {
977
914k
    TRACE_EVENT1("io", "PosixEnv::FileExists", "path", fname);
978
914k
    ThreadRestrictions::AssertIOAllowed();
979
914k
    return access(fname.c_str(), F_OK) == 0;
980
914k
  }
981
982
6
  virtual bool DirExists(const std::string& dname) override {
983
6
    TRACE_EVENT1("io", "PosixEnv::DirExists", "path", dname);
984
6
    ThreadRestrictions::AssertIOAllowed();
985
6
    struct stat statbuf;
986
6
    if (stat(dname.c_str(), &statbuf) == 0) {
987
3
      return S_ISDIR(statbuf.st_mode);
988
3
    }
989
3
    return false;
990
3
  }
991
992
  CHECKED_STATUS GetChildren(const std::string& dir,
993
                             ExcludeDots exclude_dots,
994
195k
                             std::vector<std::string>* result) override {
995
195k
    TRACE_EVENT1("io", "PosixEnv::GetChildren", "path", dir);
996
195k
    ThreadRestrictions::AssertIOAllowed();
997
195k
    result->clear();
998
195k
    DIR* d = opendir(dir.c_str());
999
195k
    if (d == nullptr) {
1000
0
      return STATUS_IO_ERROR(dir, errno);
1001
0
    }
1002
195k
    struct dirent* entry;
1003
    // TODO: lint: Consider using readdir_r(...) instead of readdir(...) for improved thread safety.
1004
615k
    while ((entry = readdir(d)) != nullptr) {
1005
419k
      if (exclude_dots && (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)) {
1006
198k
        continue;
1007
198k
      }
1008
221k
      result->push_back(entry->d_name);
1009
221k
    }
1010
195k
    closedir(d);
1011
195k
    return Status::OK();
1012
195k
  }
1013
1014
194k
  Status DeleteFile(const std::string& fname) override {
1015
194k
    TRACE_EVENT1("io", "PosixEnv::DeleteFile", "path", fname);
1016
194k
    ThreadRestrictions::AssertIOAllowed();
1017
194k
    Status result;
1018
194k
    if (unlink(fname.c_str()) != 0) {
1019
1.67k
      result = STATUS_IO_ERROR(fname, errno);
1020
1.67k
    }
1021
194k
    return result;
1022
194k
  };
1023
1024
1.66M
  Status CreateDir(const std::string& name) override {
1025
1.66M
    TRACE_EVENT1("io", "PosixEnv::CreateDir", "path", name);
1026
1.66M
    ThreadRestrictions::AssertIOAllowed();
1027
1.66M
    Status result;
1028
1.66M
    if (mkdir(name.c_str(), 0755) != 0) {
1029
1.01M
      result = STATUS_IO_ERROR(name, errno);
1030
1.01M
    }
1031
1.66M
    return result;
1032
1.66M
  };
1033
1034
122k
  Status DeleteDir(const std::string& name) override {
1035
122k
    TRACE_EVENT1("io", "PosixEnv::DeleteDir", "path", name);
1036
122k
    ThreadRestrictions::AssertIOAllowed();
1037
122k
    Status result;
1038
122k
    if (rmdir(name.c_str()) != 0) {
1039
0
      result = STATUS_IO_ERROR(name, errno);
1040
0
    }
1041
122k
    return result;
1042
122k
  };
1043
1044
2.15M
  Status SyncDir(const std::string& dirname) override {
1045
2.15M
    TRACE_EVENT1("io", "SyncDir", "path", dirname);
1046
2.15M
    ThreadRestrictions::AssertIOAllowed();
1047
2.15M
    if (FLAGS_never_fsync) return Status::OK();
1048
18.4E
    int dir_fd;
1049
18.4E
    if ((dir_fd = open(dirname.c_str(), O_DIRECTORY|O_RDONLY)) == -1) {
1050
0
      return STATUS_IO_ERROR(dirname, errno);
1051
0
    }
1052
18.4E
    ScopedFdCloser fd_closer(dir_fd);
1053
18.4E
    if (fsync(dir_fd) != 0) {
1054
0
      return STATUS_IO_ERROR(dirname, errno);
1055
0
    }
1056
18.4E
    return Status::OK();
1057
18.4E
  }
1058
1059
97.7k
  Status DeleteRecursively(const std::string &name) override {
1060
97.7k
    return Walk(name, POST_ORDER, std::bind(&PosixEnv::DeleteRecursivelyCb, this, _1, _2, _3));
1061
97.7k
  }
1062
1063
482k
  Result<uint64_t> GetFileSize(const std::string& fname) override {
1064
482k
    return file_factory_->GetFileSize(fname);
1065
482k
  }
1066
1067
4.50k
  Result<uint64_t> GetFileINode(const std::string& fname) override {
1068
4.50k
    return GetFileStat(
1069
4.50k
        fname, "PosixEnv::GetFileINode", [](const struct stat& sbuf) { return sbuf.st_ino; });
1070
4.50k
  }
1071
1072
0
  Result<uint64_t> GetFileSizeOnDisk(const std::string& fname) override {
1073
0
    return GetFileStat(
1074
0
        fname, "PosixEnv::GetFileSizeOnDisk", [](const struct stat& sbuf) {
1075
          // From stat(2):
1076
          //
1077
          //   The st_blocks field indicates the number of blocks allocated to
1078
          //   the file, 512-byte units. (This may be smaller than st_size/512
1079
          //   when the file has holes.)
1080
0
          return sbuf.st_blocks * 512;
1081
0
        });
1082
0
  }
1083
1084
3
  Result<uint64_t> GetBlockSize(const string& fname) override {
1085
3
    return GetFileStat(
1086
2
        fname, "PosixEnv::GetBlockSize", [](const struct stat& sbuf) { return sbuf.st_blksize; });
1087
3
  }
1088
1089
  CHECKED_STATUS LinkFile(const std::string& src,
1090
634
                          const std::string& target) override {
1091
634
    if (link(src.c_str(), target.c_str()) != 0) {
1092
0
      if (errno == EXDEV) {
1093
0
        return STATUS(NotSupported, "No cross FS links allowed");
1094
0
      }
1095
0
      return STATUS_IO_ERROR(src, errno);
1096
0
    }
1097
634
    return Status::OK();
1098
634
  }
1099
1100
444
  Result<std::string> ReadLink(const std::string& link) override {
1101
444
    char buf[PATH_MAX];
1102
444
    const auto len = readlink(link.c_str(), buf, sizeof(buf));
1103
444
    if (len > -1) {
1104
444
      return std::string(buf, buf + len);
1105
444
    }
1106
0
    return STATUS_IO_ERROR(link, errno);
1107
0
  }
1108
1109
1.23M
  Status RenameFile(const std::string& src, const std::string& target) override {
1110
1.23M
    TRACE_EVENT2("io", "PosixEnv::RenameFile", "src", src, "dst", target);
1111
1.23M
    ThreadRestrictions::AssertIOAllowed();
1112
1.23M
    Status result;
1113
1.23M
    if (rename(src.c_str(), target.c_str()) != 0) {
1114
0
      result = STATUS_IO_ERROR(Format("Rename $0 => $1", src, target), errno);
1115
0
    }
1116
1.23M
    return result;
1117
1.23M
  }
1118
1119
  virtual Status LockFile(const std::string& fname,
1120
                          FileLock** lock,
1121
12.2k
                          bool recursive_lock_ok) override {
1122
12.2k
    TRACE_EVENT1("io", "PosixEnv::LockFile", "path", fname);
1123
12.2k
    ThreadRestrictions::AssertIOAllowed();
1124
12.2k
    *lock = nullptr;
1125
12.2k
    Status result;
1126
12.2k
    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
1127
12.2k
    if (fd < 0) {
1128
3.77k
      result = STATUS_IO_ERROR(fname, errno);
1129
8.45k
    } else if (LockOrUnlock(fname, fd, true /* lock */, recursive_lock_ok) == -1) {
1130
0
      result = STATUS_IO_ERROR("lock " + fname, errno);
1131
0
      close(fd);
1132
8.45k
    } else {
1133
8.45k
      auto my_lock = new PosixFileLock;
1134
8.45k
      my_lock->fd_ = fd;
1135
8.45k
      my_lock->filename = fname;
1136
8.45k
      *lock = my_lock;
1137
8.45k
    }
1138
12.2k
    return result;
1139
12.2k
  }
1140
1141
0
  Status UnlockFile(FileLock* lock) override {
1142
0
    TRACE_EVENT0("io", "PosixEnv::UnlockFile");
1143
0
    ThreadRestrictions::AssertIOAllowed();
1144
0
    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
1145
0
    Status result;
1146
0
    if (LockOrUnlock(my_lock->filename,
1147
0
                     my_lock->fd_,
1148
0
                     false /* lock */,
1149
0
                     false /* recursive_lock_ok (unused when lock = false) */) == -1) {
1150
0
      result = STATUS_IO_ERROR("unlock", errno);
1151
0
    }
1152
0
    close(my_lock->fd_);
1153
0
    delete my_lock;
1154
0
    return result;
1155
0
  }
1156
1157
1.89k
  Status GetTestDirectory(std::string* result) override {
1158
1.89k
    string dir;
1159
1.89k
    const char* env = getenv("TEST_TMPDIR");
1160
1.89k
    if (env && env[0] != '\0') {
1161
1.89k
      dir = env;
1162
0
    } else {
1163
0
      char buf[100];
1164
0
      snprintf(buf, sizeof(buf), "/tmp/ybtest-%d", static_cast<int>(geteuid()));
1165
0
      dir = buf;
1166
0
    }
1167
    // Directory may already exist
1168
1.89k
    WARN_NOT_OK(CreateDir(dir), "Create test dir failed");
1169
    // /tmp may be a symlink, so canonicalize the path.
1170
1.89k
    return Canonicalize(dir, result);
1171
1.89k
  }
1172
1173
1.83G
  uint64_t gettid() override {
1174
    // Platform-independent thread ID.  We can't use pthread_self here,
1175
    // because that function returns a totally opaque ID, which can't be
1176
    // compared via normal means.
1177
1.83G
    if (thread_local_id == 0) {
1178
989k
      thread_local_id = Barrier_AtomicIncrement(&cur_thread_local_id_, 1);
1179
989k
    }
1180
1.83G
    return thread_local_id;
1181
1.83G
  }
1182
1183
20.0k
  uint64_t NowMicros() override {
1184
20.0k
    struct timeval tv;
1185
20.0k
    gettimeofday(&tv, nullptr);
1186
20.0k
    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
1187
20.0k
  }
1188
1189
211k
  uint64_t NowNanos() override {
1190
#if defined(__linux__) || defined(OS_FREEBSD)
1191
    struct timespec ts;
1192
    clock_gettime(CLOCK_MONOTONIC, &ts);
1193
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
1194
#elif defined(__MACH__)
1195
211k
    clock_serv_t cclock;
1196
211k
    mach_timespec_t ts;
1197
211k
    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
1198
211k
    clock_get_time(cclock, &ts);
1199
211k
    mach_port_deallocate(mach_task_self(), cclock);
1200
211k
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
1201
#else
1202
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
1203
       std::chrono::steady_clock::now().time_since_epoch()).count();
1204
#endif
1205
211k
  }
1206
1207
0
  void SleepForMicroseconds(int micros) override {
1208
0
    ThreadRestrictions::AssertWaitAllowed();
1209
0
    SleepFor(MonoDelta::FromMicroseconds(micros));
1210
0
  }
1211
1212
24.5k
  Status GetExecutablePath(string* path) override {
1213
24.5k
    uint32_t size = 64;
1214
24.5k
    size_t len = 0;
1215
49.1k
    while (true) {
1216
49.1k
      std::unique_ptr<char[]> buf(new char[size]);
1217
#if defined(__linux__)
1218
      auto rc = readlink("/proc/self/exe", buf.get(), size);
1219
      if (rc == -1) {
1220
        return STATUS(IOError, "Unable to determine own executable path", "", Errno(errno));
1221
      } else if (rc >= size) {
1222
        // The buffer wasn't large enough
1223
        size *= 2;
1224
        continue;
1225
      }
1226
      len = rc;
1227
#elif defined(__APPLE__)
1228
49.1k
      if (_NSGetExecutablePath(buf.get(), &size) != 0) {
1229
        // The buffer wasn't large enough; 'size' has been updated.
1230
24.5k
        continue;
1231
24.5k
      }
1232
24.5k
      len = strlen(buf.get());
1233
#else
1234
#error Unsupported platform
1235
#endif
1236
1237
24.5k
      path->assign(buf.get(), len);
1238
24.5k
      break;
1239
24.5k
    }
1240
24.5k
    return Status::OK();
1241
24.5k
  }
1242
1243
68.9k
  Status IsDirectory(const string& path, bool* is_dir) override {
1244
68.9k
    TRACE_EVENT1("io", "PosixEnv::IsDirectory", "path", path);
1245
68.9k
    ThreadRestrictions::AssertIOAllowed();
1246
68.9k
    Status s;
1247
68.9k
    struct stat sbuf;
1248
68.9k
    if (stat(path.c_str(), &sbuf) != 0) {
1249
34.5k
      s = STATUS_IO_ERROR(path, errno);
1250
34.4k
    } else {
1251
34.4k
      *is_dir = S_ISDIR(sbuf.st_mode);
1252
34.4k
    }
1253
68.9k
    return s;
1254
68.9k
  }
1255
1256
3.62k
  Result<bool> IsExecutableFile(const std::string& path) override {
1257
3.62k
    TRACE_EVENT1("io", "PosixEnv::IsExecutableFile", "path", path);
1258
3.62k
    ThreadRestrictions::AssertIOAllowed();
1259
3.62k
    Status s;
1260
3.62k
    struct stat sbuf;
1261
3.62k
    if (stat(path.c_str(), &sbuf) != 0) {
1262
0
      if (errno == ENOENT) {
1263
        // If the file does not exist, we just return false.
1264
0
        return false;
1265
0
      }
1266
0
      return STATUS_IO_ERROR(path, errno);
1267
0
    }
1268
1269
3.62k
    return !S_ISDIR(sbuf.st_mode) && (sbuf.st_mode & S_IXUSR);
1270
3.62k
  }
1271
1272
97.8k
  Status Walk(const string& root, DirectoryOrder order, const WalkCallback& cb) override {
1273
97.8k
    TRACE_EVENT1("io", "PosixEnv::Walk", "path", root);
1274
97.8k
    ThreadRestrictions::AssertIOAllowed();
1275
    // Some sanity checks
1276
97.8k
    CHECK_NE(root, "/");
1277
97.8k
    CHECK_NE(root, "./");
1278
97.8k
    CHECK_NE(root, ".");
1279
97.8k
    CHECK_NE(root, "");
1280
1281
    // FTS requires a non-const copy of the name. strdup it and free() when
1282
    // we leave scope.
1283
97.8k
    std::unique_ptr<char, FreeDeleter> name_dup(strdup(root.c_str()));
1284
97.8k
    char *paths[] = { name_dup.get(), nullptr };
1285
1286
    // FTS_NOCHDIR is important here to make this thread-safe.
1287
97.8k
    std::unique_ptr<FTS, FtsCloser> tree(
1288
97.8k
        fts_open(paths, FTS_PHYSICAL | FTS_XDEV | FTS_NOCHDIR, nullptr));
1289
97.8k
    if (!tree.get()) {
1290
0
      return STATUS_IO_ERROR(root, errno);
1291
0
    }
1292
1293
97.8k
    FTSENT *ent = nullptr;
1294
97.8k
    bool had_errors = false;
1295
486k
    while ((ent = fts_read(tree.get())) != nullptr) {
1296
389k
      bool doCb = false;
1297
389k
      FileType type = DIRECTORY_TYPE;
1298
389k
      switch (ent->fts_info) {
1299
122k
        case FTS_D:         // Directory in pre-order
1300
122k
          if (order == PRE_ORDER) {
1301
5
            doCb = true;
1302
5
          }
1303
122k
          break;
1304
122k
        case FTS_DP:        // Directory in post-order
1305
122k
          if (order == POST_ORDER) {
1306
122k
            doCb = true;
1307
122k
          }
1308
122k
          break;
1309
144k
        case FTS_F:         // A regular file
1310
144k
        case FTS_SL:        // A symbolic link
1311
144k
        case FTS_SLNONE:    // A broken symbolic link
1312
144k
        case FTS_DEFAULT:   // Unknown type of file
1313
144k
          doCb = true;
1314
144k
          type = FILE_TYPE;
1315
144k
          break;
1316
1317
0
        case FTS_ERR:
1318
0
          LOG(WARNING) << "Unable to access file " << ent->fts_path
1319
0
                       << " during walk: " << strerror(ent->fts_errno);
1320
0
          had_errors = true;
1321
0
          break;
1322
1323
154
        default:
1324
154
          LOG(WARNING) << "Unable to access file " << ent->fts_path
1325
154
                       << " during walk (code " << ent->fts_info << ")";
1326
154
          break;
1327
389k
      }
1328
389k
      if (doCb) {
1329
266k
        if (!cb(type, DirName(ent->fts_path), ent->fts_name).ok()) {
1330
2
          had_errors = true;
1331
2
        }
1332
266k
      }
1333
389k
    }
1334
1335
97.7k
    if (had_errors) {
1336
1
      return STATUS(IOError, root, "One or more errors occurred");
1337
1
    }
1338
97.7k
    return Status::OK();
1339
97.7k
  }
1340
1341
19.7k
  Status Canonicalize(const string& path, string* result) override {
1342
19.7k
    TRACE_EVENT1("io", "PosixEnv::Canonicalize", "path", path);
1343
19.7k
    ThreadRestrictions::AssertIOAllowed();
1344
19.7k
    std::unique_ptr<char[], FreeDeleter> r(realpath(path.c_str(), nullptr));
1345
19.7k
    if (!r) {
1346
1
      return STATUS_IO_ERROR(path, errno);
1347
1
    }
1348
19.7k
    *result = string(r.get());
1349
19.7k
    return Status::OK();
1350
19.7k
  }
1351
1352
41
  Status GetTotalRAMBytes(int64_t* ram) override {
1353
41
#if defined(__APPLE__)
1354
41
    int mib[2];
1355
41
    size_t length = sizeof(*ram);
1356
1357
    // Get the Physical memory size
1358
41
    mib[0] = CTL_HW;
1359
41
    mib[1] = HW_MEMSIZE;
1360
0
    CHECK_ERR(sysctl(mib, 2, ram, &length, nullptr, 0)) << "sysctl CTL_HW HW_MEMSIZE failed";
1361
#else
1362
    struct sysinfo info;
1363
    if (sysinfo(&info) < 0) {
1364
      return STATUS_IO_ERROR("sysinfo() failed", errno);
1365
    }
1366
    *ram = info.totalram;
1367
#endif
1368
41
    return Status::OK();
1369
41
  }
1370
1371
6.10k
  FileFactory* GetFileFactory() {
1372
6.10k
    return file_factory_.get();
1373
6.10k
  }
1374
1375
10
  Result<uint64_t> GetFreeSpaceBytes(const std::string& path) override {
1376
10
    if (PREDICT_FALSE(FLAGS_TEST_simulate_free_space_bytes >= 0)) {
1377
0
      return FLAGS_TEST_simulate_free_space_bytes;
1378
0
    }
1379
1380
10
    auto stat = GetFilesystemStats(path);
1381
10
    RETURN_NOT_OK(stat);
1382
10
    uint64_t block_size = stat->f_frsize > 0 ? static_cast<uint64_t>(stat->f_frsize) :
1383
0
                                               static_cast<uint64_t>(stat->f_bsize);
1384
10
    uint64_t available_blocks = static_cast<uint64_t>(stat->f_bavail);
1385
1386
10
    return available_blocks * block_size;
1387
10
  }
1388
1389
38.3k
  Result<FilesystemStats> GetFilesystemStatsBytes(const std::string& path) override {
1390
38.3k
    auto stat = GetFilesystemStats(path);
1391
38.3k
    RETURN_NOT_OK(stat);
1392
25.0k
    uint64_t block_size = stat->f_frsize > 0 ? static_cast<uint64_t>(stat->f_frsize) :
1393
0
                                               static_cast<uint64_t>(stat->f_bsize);
1394
25.0k
    uint64_t available_blocks = static_cast<uint64_t>(stat->f_bavail);
1395
25.0k
    uint64_t total_blocks = static_cast<uint64_t>(stat->f_blocks);
1396
1397
25.0k
    return FilesystemStats{available_blocks * block_size,
1398
25.0k
                           (total_blocks - available_blocks) * block_size,
1399
25.0k
                           total_blocks * block_size};
1400
38.3k
  }
1401
1402
196k
  Result<ResourceLimits> GetUlimit(int resource) override {
1403
196k
    struct rlimit lim;
1404
196k
    if (getrlimit(resource, &lim) != 0) {
1405
0
      return STATUS_IO_ERROR("getrlimit() failed", errno);
1406
0
    }
1407
196k
    ResourceLimit soft(lim.rlim_cur);
1408
196k
    ResourceLimit hard(lim.rlim_max);
1409
196k
    ResourceLimits limits { soft, hard };
1410
196k
    return limits;
1411
196k
  }
1412
1413
0
  CHECKED_STATUS SetUlimit(int resource, ResourceLimit value) override {
1414
0
    return SetUlimit(resource, value, strings::Substitute("resource no. $0", resource));
1415
0
  }
1416
1417
  CHECKED_STATUS SetUlimit(
1418
20.6k
      int resource, ResourceLimit value, const std::string& resource_name) override {
1419
1420
20.6k
    auto limits = VERIFY_RESULT(GetUlimit(resource));
1421
20.6k
    if (limits.soft == value) {
1422
10.3k
      return Status::OK();
1423
10.3k
    }
1424
10.3k
    if (limits.hard < value) {
1425
0
      return STATUS_FORMAT(
1426
0
        InvalidArgument,
1427
0
        "Resource limit value $0 for resource $1 greater than hard limit $2",
1428
0
        value, resource, limits.hard.ToString());
1429
0
    }
1430
10.3k
    struct rlimit lim;
1431
10.3k
    lim.rlim_cur = value.RawValue();
1432
10.3k
    lim.rlim_max = limits.hard.RawValue();
1433
10.3k
    LOG(INFO)
1434
10.3k
        << "Modifying limit for " << resource_name
1435
10.3k
        << " from " << limits.soft.ToString()
1436
10.3k
        << " to " << value.ToString();
1437
10.3k
    if (setrlimit(resource, &lim) != 0) {
1438
0
      return STATUS(RuntimeError, "Unable to set rlimit", Errno(errno));
1439
0
    }
1440
10.3k
    return Status::OK();
1441
10.3k
  }
1442
1443
0
  bool IsEncrypted() const override {
1444
0
    return file_factory_->IsEncrypted();
1445
0
  }
1446
1447
 private:
1448
  // std::unique_ptr Deleter implementation for fts_close
1449
  struct FtsCloser {
1450
97.8k
    void operator()(FTS *fts) const {
1451
97.8k
      if (fts) { fts_close(fts); }
1452
97.8k
    }
1453
  };
1454
1455
266k
  Status DeleteRecursivelyCb(FileType type, const string& dirname, const string& basename) {
1456
266k
    string full_path = JoinPathSegments(dirname, basename);
1457
266k
    Status s;
1458
266k
    switch (type) {
1459
144k
      case FILE_TYPE:
1460
144k
        s = DeleteFile(full_path);
1461
144k
        WARN_NOT_OK(s, "Could not delete file");
1462
144k
        return s;
1463
122k
      case DIRECTORY_TYPE:
1464
122k
        s = DeleteDir(full_path);
1465
122k
        WARN_NOT_OK(s, "Could not delete directory");
1466
122k
        return s;
1467
0
      default:
1468
0
        LOG(FATAL) << "Unknown file type: " << type;
1469
0
        return Status::OK();
1470
266k
    }
1471
266k
  }
1472
1473
  std::unique_ptr<FileFactory> file_factory_;
1474
};
1475
1476
class PosixFileFactory : public FileFactory {
1477
#if defined(__linux__)
1478
  static constexpr int kODirectFlags = O_DIRECT | O_NOATIME | O_SYNC;
1479
#else
1480
  static constexpr int kODirectFlags = 0;
1481
#endif
1482
1483
 public:
1484
15.7k
  PosixFileFactory() {}
1485
0
  ~PosixFileFactory() {}
1486
1487
  Status NewSequentialFile(
1488
2.08k
      const std::string& fname, std::unique_ptr<SequentialFile>* result) override {
1489
2.08k
    TRACE_EVENT1("io", "PosixEnv::NewSequentialFile", "path", fname);
1490
2.08k
    ThreadRestrictions::AssertIOAllowed();
1491
2.08k
    FILE* f = fopen(fname.c_str(), "r");
1492
2.08k
    if (f == nullptr) {
1493
0
      return STATUS_IO_ERROR(fname, errno);
1494
2.08k
    } else {
1495
2.08k
      result->reset(new yb::PosixSequentialFile(fname, f, yb::FileSystemOptions::kDefault));
1496
2.08k
      return Status::OK();
1497
2.08k
    }
1498
2.08k
  }
1499
1500
  Status NewRandomAccessFile(const std::string& fname,
1501
542k
                             std::unique_ptr<RandomAccessFile>* result) override {
1502
542k
    TRACE_EVENT1("io", "PosixEnv::NewRandomAccessFile", "path", fname);
1503
542k
    ThreadRestrictions::AssertIOAllowed();
1504
542k
    int fd = open(fname.c_str(), O_RDONLY);
1505
542k
    if (fd < 0) {
1506
17.8k
      return STATUS_IO_ERROR(fname, errno);
1507
17.8k
    }
1508
1509
524k
    result->reset(new yb::PosixRandomAccessFile(fname, fd, yb::FileSystemOptions::kDefault));
1510
524k
    return Status::OK();
1511
524k
  }
1512
1513
  Status NewWritableFile(const std::string& fname,
1514
1.34k
                         std::unique_ptr<WritableFile>* result) override {
1515
1.34k
    return NewWritableFile(WritableFileOptions(), fname, result);
1516
1.34k
  }
1517
1518
  Status NewWritableFile(const WritableFileOptions& opts,
1519
                         const std::string& fname,
1520
7.24k
                         std::unique_ptr<WritableFile>* result) override {
1521
7.24k
    TRACE_EVENT1("io", "PosixEnv::NewWritableFile", "path", fname);
1522
7.24k
    int fd = -1;
1523
7.24k
    int extra_flags = 0;
1524
7.24k
    if (UseODirect(opts.o_direct)) {
1525
0
      extra_flags = kODirectFlags;
1526
0
    }
1527
7.24k
    RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd, extra_flags));
1528
7.24k
    return InstantiateNewWritableFile(fname, fd, opts, result);
1529
7.24k
  }
1530
1531
  Status NewTempWritableFile(const WritableFileOptions& opts,
1532
                             const std::string& name_template,
1533
                             std::string* created_filename,
1534
1.23M
                             std::unique_ptr<WritableFile>* result) override {
1535
1.23M
    TRACE_EVENT1("io", "PosixEnv::NewTempWritableFile", "template", name_template);
1536
1.23M
    ThreadRestrictions::AssertIOAllowed();
1537
1.23M
    std::unique_ptr<char[]> fname(new char[name_template.size() + 1]);
1538
1.23M
    ::snprintf(fname.get(), name_template.size() + 1, "%s", name_template.c_str());
1539
1.23M
    int fd = -1;
1540
1.23M
    if (UseODirect(opts.o_direct)) {
1541
0
      fd = ::mkostemp(fname.get(), kODirectFlags);
1542
1.23M
    } else {
1543
1.23M
      fd = ::mkstemp(fname.get());
1544
1.23M
    }
1545
1.23M
    if (fd < 0) {
1546
4
      return STATUS_IO_ERROR(Format("Call to mkstemp() failed on name template $0", name_template),
1547
4
                             errno);
1548
4
    }
1549
1.23M
    *created_filename = fname.get();
1550
1.23M
    return InstantiateNewWritableFile(*created_filename, fd, opts, result);
1551
1.23M
  }
1552
1553
1
  Status NewRWFile(const string& fname, std::unique_ptr<RWFile>* result) override {
1554
1
    return NewRWFile(RWFileOptions(), fname, result);
1555
1
  }
1556
1557
  Status NewRWFile(const RWFileOptions& opts, const string& fname,
1558
3
                   std::unique_ptr<RWFile>* result) override {
1559
3
    TRACE_EVENT1("io", "PosixEnv::NewRWFile", "path", fname);
1560
3
    int fd = -1;
1561
3
    RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd));
1562
2
    result->reset(new PosixRWFile(fname, fd, opts.sync_on_close));
1563
2
    return Status::OK();
1564
3
  }
1565
1566
482k
  Result<uint64_t> GetFileSize(const std::string& fname) override {
1567
482k
    return GetFileStat(
1568
482k
        fname, "PosixEnv::GetFileSize", [](const struct stat& sbuf) { return sbuf.st_size; });
1569
482k
  }
1570
1571
0
  bool IsEncrypted() const override {
1572
0
    return false;
1573
0
  }
1574
1575
 private:
1576
1.23M
  bool UseODirect(bool o_direct) {
1577
#if defined(__linux__)
1578
    return o_direct;
1579
#else
1580
1.23M
    return false;
1581
1.23M
#endif
1582
1.23M
  }
1583
1584
  Status InstantiateNewWritableFile(const std::string& fname,
1585
                                    int fd,
1586
                                    const WritableFileOptions& opts,
1587
1.23M
                                    std::unique_ptr<WritableFile>* result) {
1588
1.23M
    uint64_t file_size = 0;
1589
1.23M
    if (opts.mode == PosixEnv::OPEN_EXISTING) {
1590
1
      auto lseek_result = lseek(fd, 0, SEEK_END);
1591
1
      if (lseek_result < 0) {
1592
0
        return STATUS_IO_ERROR(fname, errno);
1593
0
      }
1594
1
      file_size = lseek_result;
1595
1
    }
1596
#if defined(__linux__)
1597
    if (opts.o_direct)
1598
      *result = std::make_unique<PosixDirectIOWritableFile>(
1599
          fname, fd, file_size, opts.sync_on_close);
1600
    else
1601
#endif
1602
1.23M
    *result = std::make_unique<PosixWritableFile>(fname, fd, file_size, opts.sync_on_close);
1603
1.23M
    return Status::OK();
1604
1.23M
  }
1605
};
1606
1607
15.7k
PosixEnv::PosixEnv() : file_factory_(std::make_unique<PosixFileFactory>()) {}
1608
PosixEnv::PosixEnv(std::unique_ptr<FileFactory> file_factory) :
1609
6.10k
  file_factory_(std::move(file_factory)) {}
1610
1611
}  // namespace
1612
1613
static pthread_once_t once = PTHREAD_ONCE_INIT;
1614
static Env* default_env;
1615
15.7k
static void InitDefaultEnv() { default_env = new PosixEnv; }
1616
1617
1.83G
Env* Env::Default() {
1618
1.83G
  pthread_once(&once, InitDefaultEnv);
1619
1.83G
  return default_env;
1620
1.83G
}
1621
1622
6.10k
FileFactory* Env::DefaultFileFactory() {
1623
6.10k
  return down_cast<PosixEnv*>(Env::Default())->GetFileFactory();
1624
6.10k
}
1625
1626
6.10k
std::unique_ptr<Env> Env::NewDefaultEnv(std::unique_ptr<FileFactory> file_factory) {
1627
6.10k
  return std::make_unique<PosixEnv>(std::move(file_factory));
1628
6.10k
}
1629
1630
1631
}  // namespace yb