/Users/deen/code/yugabyte-db/src/yb/util/env_posix.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | | // Use of this source code is governed by a BSD-style license that can be |
3 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
4 | | // |
5 | | // The following only applies to changes made to this file as part of YugaByte development. |
6 | | // |
7 | | // Portions Copyright (c) YugaByte, Inc. |
8 | | // |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
10 | | // in compliance with the License. You may obtain a copy of the License at |
11 | | // |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // |
14 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
15 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
16 | | // or implied. See the License for the specific language governing permissions and limitations |
17 | | // under the License. |
18 | | // |
19 | | |
20 | | #include <dirent.h> |
21 | | #include <fcntl.h> |
22 | | #include <fts.h> |
23 | | #include <pthread.h> |
24 | | #include <stdio.h> |
25 | | #include <stdlib.h> |
26 | | #include <string.h> |
27 | | #include <sys/stat.h> |
28 | | #include <sys/statvfs.h> |
29 | | #include <sys/time.h> |
30 | | #include <sys/types.h> |
31 | | #include <sys/uio.h> |
32 | | #include <time.h> |
33 | | |
34 | | #include <set> |
35 | | #include <vector> |
36 | | |
37 | | #if defined(__APPLE__) |
38 | | #include <mach-o/dyld.h> |
39 | | #include <sys/sysctl.h> |
40 | | #else |
41 | | #include <linux/falloc.h> |
42 | | #include <sys/sysinfo.h> |
43 | | #endif // defined(__APPLE__) |
44 | | #include <sys/resource.h> |
45 | | |
46 | | #include <glog/logging.h> |
47 | | |
48 | | #include "yb/gutil/atomicops.h" |
49 | | #include "yb/gutil/bind.h" |
50 | | #include "yb/gutil/callback.h" |
51 | | #include "yb/gutil/casts.h" |
52 | | #include "yb/gutil/map-util.h" |
53 | | #include "yb/gutil/strings/substitute.h" |
54 | | |
55 | | #include "yb/util/alignment.h" |
56 | | #include "yb/util/debug/trace_event.h" |
57 | | #include "yb/util/env.h" |
58 | | #include "yb/util/errno.h" |
59 | | #include "yb/util/file_system_posix.h" |
60 | | #include "yb/util/flag_tags.h" |
61 | | #include "yb/util/format.h" |
62 | | #include "yb/util/locks.h" |
63 | | #include "yb/util/logging.h" |
64 | | #include "yb/util/malloc.h" |
65 | | #include "yb/util/monotime.h" |
66 | | #include "yb/util/path_util.h" |
67 | | #include "yb/util/result.h" |
68 | | #include "yb/util/slice.h" |
69 | | #include "yb/util/status.h" |
70 | | #include "yb/util/status_format.h" |
71 | | #include "yb/util/status_log.h" |
72 | | #include "yb/util/stopwatch.h" |
73 | | #include "yb/util/thread_restrictions.h" |
74 | | |
75 | | // Copied from falloc.h. Useful for older kernels that lack support for |
76 | | // hole punching; fallocate(2) will return EOPNOTSUPP. |
77 | | #ifndef FALLOC_FL_KEEP_SIZE |
78 | | #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ |
79 | | #endif |
80 | | #ifndef FALLOC_FL_PUNCH_HOLE |
81 | | #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ |
82 | | #endif |
83 | | |
84 | | // For platforms without fdatasync (like OS X) |
85 | | #ifndef fdatasync |
86 | 626 | #define fdatasync fsync |
87 | | #endif |
88 | | |
89 | | // For platforms without unlocked_stdio (like OS X) |
90 | | #ifndef fread_unlocked |
91 | | #define fread_unlocked fread |
92 | | #endif |
93 | | |
94 | | // See KUDU-588 for details. |
95 | | DEFINE_bool(writable_file_use_fsync, false, |
96 | | "Use fsync(2) instead of fdatasync(2) for synchronizing dirty " |
97 | | "data to disk."); |
98 | | TAG_FLAG(writable_file_use_fsync, advanced); |
99 | | |
100 | | #ifdef __APPLE__ |
101 | | // Never fsync on Mac OS X as we are getting many slow fsync errors in Jenkins and the fsync |
102 | | // implementation is very different in production (on Linux) anyway. |
103 | | #define FLAGS_never_fsync_default true |
104 | | #else |
105 | | #define FLAGS_never_fsync_default false |
106 | | #endif |
107 | | |
108 | | DEFINE_bool(never_fsync, FLAGS_never_fsync_default, |
109 | | "Never fsync() anything to disk. This is used by tests to speed up runtime and improve " |
110 | | "stability. This is very unsafe to use in production."); |
111 | | |
112 | | TAG_FLAG(never_fsync, advanced); |
113 | | TAG_FLAG(never_fsync, unsafe); |
114 | | |
115 | | DEFINE_int32(o_direct_block_size_bytes, 4096, |
116 | | "Size of the block to use when flag durable_wal_write is set."); |
117 | | TAG_FLAG(o_direct_block_size_bytes, advanced); |
118 | | |
119 | | DEFINE_int32(o_direct_block_alignment_bytes, 4096, |
120 | | "Alignment (in bytes) for blocks used for O_DIRECT operations."); |
121 | | TAG_FLAG(o_direct_block_alignment_bytes, advanced); |
122 | | |
123 | | DEFINE_test_flag(bool, simulate_fs_without_fallocate, false, |
124 | | "If true, the system simulates a file system that doesn't support fallocate."); |
125 | | |
126 | | DEFINE_test_flag(int64, simulate_free_space_bytes, -1, |
127 | | "If a non-negative value, GetFreeSpaceBytes will return the specified value."); |
128 | | |
129 | | DECLARE_bool(never_fsync); |
130 | | |
131 | | using namespace std::placeholders; |
132 | | using base::subtle::Atomic64; |
133 | | using base::subtle::Barrier_AtomicIncrement; |
134 | | using std::vector; |
135 | | using strings::Substitute; |
136 | | |
137 | | static __thread uint64_t thread_local_id; |
138 | | static Atomic64 cur_thread_local_id_; |
139 | | |
140 | | namespace yb { |
141 | | |
142 | | namespace { |
143 | | |
144 | | #if defined(__APPLE__) |
145 | | // Simulates Linux's fallocate file preallocation API on OS X. |
146 | 160k | int fallocate(int fd, int mode, off_t offset, off_t len) { |
147 | 160k | CHECK_EQ(mode, 0); |
148 | 160k | off_t size = offset + len; |
149 | | |
150 | 160k | struct stat stat; |
151 | 160k | int ret = fstat(fd, &stat); |
152 | 160k | if (ret < 0) { |
153 | 0 | return ret; |
154 | 0 | } |
155 | | |
156 | 160k | if (160k stat.st_blocks * 512 < size160k ) { |
157 | | // The offset field seems to have no effect; the file is always allocated |
158 | | // with space from 0 to the size. This is probably because OS X does not |
159 | | // support sparse files. |
160 | 160k | fstore_t store = {F_ALLOCATECONTIG, F_PEOFPOSMODE, 0, size}; |
161 | 160k | if (fcntl(fd, F_PREALLOCATE, &store) < 0) { |
162 | 0 | LOG(INFO) << "Unable to allocate contiguous disk space, attempting non-contiguous allocation"; |
163 | 0 | store.fst_flags = F_ALLOCATEALL; |
164 | 0 | ret = fcntl(fd, F_PREALLOCATE, &store); |
165 | 0 | if (ret < 0) { |
166 | 0 | return ret; |
167 | 0 | } |
168 | 0 | } |
169 | 160k | } |
170 | | |
171 | 160k | if (160k stat.st_size < size160k ) { |
172 | | // fcntl does not change the file size, so set it if necessary. |
173 | 160k | return ftruncate(fd, size); |
174 | 160k | } |
175 | 18.4E | return 0; |
176 | 160k | } |
177 | | #endif |
178 | | |
179 | | // Close file descriptor when object goes out of scope. |
180 | | class ScopedFdCloser { |
181 | | public: |
182 | | explicit ScopedFdCloser(int fd) |
183 | 1 | : fd_(fd) { |
184 | 1 | } |
185 | | |
186 | 1 | ~ScopedFdCloser() { |
187 | 1 | ThreadRestrictions::AssertIOAllowed(); |
188 | 1 | int err = ::close(fd_); |
189 | 1 | if (PREDICT_FALSE(err != 0)) { |
190 | 0 | PLOG(WARNING) << "Failed to close fd " << fd_; |
191 | 0 | } |
192 | 1 | } |
193 | | |
194 | | private: |
195 | | int fd_; |
196 | | }; |
197 | | |
198 | | #define STATUS_IO_ERROR(context, err_number) \ |
199 | 2.20M | STATUS_FROM_ERRNO_SPECIAL_EIO_HANDLING(context, err_number) |
200 | | |
201 | 2.74M | static Status DoSync(int fd, const string& filename) { |
202 | 2.74M | ThreadRestrictions::AssertIOAllowed(); |
203 | 2.74M | if (FLAGS_never_fsync) { |
204 | 2.74M | return Status::OK(); |
205 | 2.74M | } |
206 | 626 | if (FLAGS_writable_file_use_fsync) { |
207 | 0 | if (fsync(fd) < 0) { |
208 | 0 | return STATUS_IO_ERROR(filename, errno); |
209 | 0 | } |
210 | 626 | } else { |
211 | 626 | if (fdatasync(fd) < 0) { |
212 | 0 | return STATUS_IO_ERROR(filename, errno); |
213 | 0 | } |
214 | 626 | } |
215 | 626 | return Status::OK(); |
216 | 626 | } |
217 | | |
218 | 13.7k | static Status DoOpen(const string& filename, Env::CreateMode mode, int* fd, int extra_flags = 0) { |
219 | 13.7k | ThreadRestrictions::AssertIOAllowed(); |
220 | 13.7k | int flags = O_RDWR; |
221 | 13.7k | switch (mode) { |
222 | 13.7k | case Env::CREATE_IF_NON_EXISTING_TRUNCATE: |
223 | 13.7k | flags |= O_CREAT | O_TRUNC; |
224 | 13.7k | break; |
225 | 6 | case Env::CREATE_NON_EXISTING: |
226 | 6 | flags |= O_CREAT | O_EXCL; |
227 | 6 | break; |
228 | 2 | case Env::OPEN_EXISTING: |
229 | 2 | break; |
230 | 0 | default: |
231 | 0 | return STATUS(NotSupported, Substitute("Unknown create mode $0", mode)); |
232 | 13.7k | } |
233 | | |
234 | 13.7k | const int f = open(filename.c_str(), flags | extra_flags, 0644); |
235 | 13.7k | if (f < 0) { |
236 | 3 | return STATUS_IO_ERROR(filename, errno); |
237 | 3 | } |
238 | 13.7k | *fd = f; |
239 | 13.7k | return Status::OK(); |
240 | 13.7k | } |
241 | | |
242 | | template <class Extractor> |
243 | 841k | Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) { |
244 | 841k | TRACE_EVENT1("io", event, "path", fname); |
245 | 841k | ThreadRestrictions::AssertIOAllowed(); |
246 | 841k | struct stat sbuf; |
247 | 841k | if (stat(fname.c_str(), &sbuf) != 0) { |
248 | 1 | return STATUS_IO_ERROR(fname, errno); |
249 | 1 | } |
250 | 841k | return extractor(sbuf); |
251 | 841k | } env_posix.cc:yb::Result<unsigned long long> yb::(anonymous namespace)::GetFileStat<yb::(anonymous namespace)::PosixFileFactory::GetFileSize(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char const*, yb::(anonymous namespace)::PosixFileFactory::GetFileSize(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)) Line | Count | Source | 243 | 834k | Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) { | 244 | 834k | TRACE_EVENT1("io", event, "path", fname); | 245 | 834k | ThreadRestrictions::AssertIOAllowed(); | 246 | 834k | struct stat sbuf; | 247 | 834k | if (stat(fname.c_str(), &sbuf) != 0) { | 248 | 0 | return STATUS_IO_ERROR(fname, errno); | 249 | 0 | } | 250 | 834k | return extractor(sbuf); | 251 | 834k | } |
env_posix.cc:yb::Result<unsigned long long> yb::(anonymous namespace)::GetFileStat<yb::(anonymous namespace)::PosixEnv::GetFileINode(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char const*, yb::(anonymous namespace)::PosixEnv::GetFileINode(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)) Line | Count | Source | 243 | 6.66k | Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) { | 244 | 6.66k | TRACE_EVENT1("io", event, "path", fname); | 245 | 6.66k | ThreadRestrictions::AssertIOAllowed(); | 246 | 6.66k | struct stat sbuf; | 247 | 6.66k | if (stat(fname.c_str(), &sbuf) != 0) { | 248 | 0 | return STATUS_IO_ERROR(fname, errno); | 249 | 0 | } | 250 | 6.66k | return extractor(sbuf); | 251 | 6.66k | } |
Unexecuted instantiation: env_posix.cc:yb::Result<unsigned long long> yb::(anonymous namespace)::GetFileStat<yb::(anonymous namespace)::PosixEnv::GetFileSizeOnDisk(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char const*, yb::(anonymous namespace)::PosixEnv::GetFileSizeOnDisk(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)) env_posix.cc:yb::Result<unsigned long long> yb::(anonymous namespace)::GetFileStat<yb::(anonymous namespace)::PosixEnv::GetBlockSize(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, char const*, yb::(anonymous namespace)::PosixEnv::GetBlockSize(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&)::'lambda'(stat const&)) Line | Count | Source | 243 | 3 | Result<uint64_t> GetFileStat(const std::string& fname, const char* event, Extractor extractor) { | 244 | 3 | TRACE_EVENT1("io", event, "path", fname); | 245 | 3 | ThreadRestrictions::AssertIOAllowed(); | 246 | 3 | struct stat sbuf; | 247 | 3 | if (stat(fname.c_str(), &sbuf) != 0) { | 248 | 1 | return STATUS_IO_ERROR(fname, errno); | 249 | 1 | } | 250 | 2 | return extractor(sbuf); | 251 | 3 | } |
|
252 | | |
253 | 1.08M | Result<struct statvfs> GetFilesystemStats(const std::string& path) { |
254 | 1.08M | struct statvfs stat; |
255 | 1.08M | auto ret = statvfs(path.c_str(), &stat); |
256 | 1.08M | if (ret != 0) { |
257 | 873k | if (errno == EACCES) { |
258 | 0 | return STATUS_SUBSTITUTE(NotAuthorized, |
259 | 0 | "Caller doesn't have the required permission on a component of the path $0", |
260 | 0 | path); |
261 | 873k | } else if (errno == EIO) { |
262 | 0 | return STATUS_SUBSTITUTE(IOError, |
263 | 0 | "I/O error occurred while reading from '$0' filesystem", |
264 | 0 | path); |
265 | 873k | } else if (errno == ELOOP) { |
266 | 0 | return STATUS_SUBSTITUTE(InternalError, |
267 | 0 | "Too many symbolic links while translating '$0' path", |
268 | 0 | path); |
269 | 873k | } else if (errno == ENAMETOOLONG) { |
270 | 0 | return STATUS_SUBSTITUTE(NotSupported, |
271 | 0 | "Path '$0' is too long", |
272 | 0 | path); |
273 | 873k | } else if (errno == ENOENT) { |
274 | 873k | return STATUS_SUBSTITUTE(NotFound, |
275 | 873k | "File specified by path '$0' doesn't exist", |
276 | 873k | path); |
277 | 873k | } else if (errno0 == ENOMEM0 ) { |
278 | 0 | return STATUS(InternalError, "Insufficient memory"); |
279 | 0 | } else if (errno == ENOSYS) { |
280 | 0 | return STATUS_SUBSTITUTE(NotSupported, |
281 | 0 | "Filesystem for path '$0' doesn't support statvfs", |
282 | 0 | path); |
283 | 0 | } else if (errno == ENOTDIR) { |
284 | 0 | return STATUS_SUBSTITUTE(InvalidArgument, |
285 | 0 | "A component of the path '$0' is not a directory", |
286 | 0 | path); |
287 | 0 | } else { |
288 | 0 | return STATUS_SUBSTITUTE(InternalError, |
289 | 0 | "Failed to read information about filesystem for path '%s': errno=$0: $1", |
290 | 0 | path, |
291 | 0 | errno, |
292 | 0 | ErrnoToString(errno)); |
293 | 0 | } |
294 | 873k | } |
295 | | |
296 | 214k | return stat; |
297 | 1.08M | } |
298 | | |
299 | | // Use non-memory mapped POSIX files to write data to a file. |
300 | | // |
301 | | // TODO (perf) investigate zeroing a pre-allocated allocated area in |
302 | | // order to further improve Sync() performance. |
303 | | class PosixWritableFile : public WritableFile { |
304 | | public: |
305 | | PosixWritableFile(const std::string& fname, int fd, uint64_t file_size, |
306 | | bool sync_on_close) |
307 | | : filename_(fname), |
308 | | fd_(fd), |
309 | | sync_on_close_(sync_on_close), |
310 | | filesize_(file_size), |
311 | | pre_allocated_size_(0), |
312 | 2.32M | pending_sync_(false) {} |
313 | | |
314 | 2.25M | ~PosixWritableFile() { |
315 | 2.25M | if (fd_ >= 0) { |
316 | 11.4k | WARN_NOT_OK(Close(), "Failed to close " + filename_); |
317 | 11.4k | } |
318 | 2.25M | } |
319 | | |
320 | 4.51M | Status Append(const Slice& data) override { |
321 | 4.51M | vector<Slice> data_vector; |
322 | 4.51M | data_vector.push_back(data); |
323 | 4.51M | return AppendVector(data_vector); |
324 | 4.51M | } |
325 | | |
326 | 29.5M | Status AppendSlices(const Slice* slices, size_t num) override { |
327 | 29.5M | ThreadRestrictions::AssertIOAllowed(); |
328 | 29.5M | static const size_t kIovMaxElements = IOV_MAX; |
329 | | |
330 | 29.5M | Status s; |
331 | 59.1M | for (size_t i = 0; i < num && s.ok()29.5M ; i += kIovMaxElements29.5M ) { |
332 | 29.5M | size_t n = std::min(num - i, kIovMaxElements); |
333 | 29.5M | s = DoWritev(slices + i, n); |
334 | 29.5M | } |
335 | | |
336 | 29.5M | pending_sync_ = true; |
337 | 29.5M | return s; |
338 | 29.5M | } |
339 | | |
340 | 160k | Status PreAllocate(uint64_t size) override { |
341 | 160k | TRACE_EVENT1("io", "PosixWritableFile::PreAllocate", "path", filename_); |
342 | 160k | ThreadRestrictions::AssertIOAllowed(); |
343 | 160k | uint64_t offset = std::max(filesize_, pre_allocated_size_); |
344 | 160k | if (PREDICT_FALSE(FLAGS_TEST_simulate_fs_without_fallocate)) { |
345 | 0 | YB_LOG_FIRST_N(WARNING, 1) << "Simulating a filesystem without fallocate() support"; |
346 | 0 | return Status::OK(); |
347 | 0 | } |
348 | 160k | if (fallocate(fd_, 0, offset, size) < 0) { |
349 | 0 | if (errno == EOPNOTSUPP) { |
350 | 0 | YB_LOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate()."; |
351 | 0 | } else if (errno == ENOSYS) { |
352 | 0 | YB_LOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate()."; |
353 | 0 | } else { |
354 | 0 | return STATUS_IO_ERROR(filename_, errno); |
355 | 0 | } |
356 | | // We don't want to modify pre_allocated_size_ since nothing was pre-allocated. |
357 | 0 | return Status::OK(); |
358 | 0 | } |
359 | 160k | pre_allocated_size_ = offset + size; |
360 | 160k | return Status::OK(); |
361 | 160k | } |
362 | | |
363 | 2.25M | Status Close() override { |
364 | 2.25M | TRACE_EVENT1("io", "PosixWritableFile::Close", "path", filename_); |
365 | 2.25M | ThreadRestrictions::AssertIOAllowed(); |
366 | 2.25M | Status s; |
367 | | |
368 | | // If we've allocated more space than we used, truncate to the |
369 | | // actual size of the file and perform Sync(). |
370 | 2.25M | if (filesize_ < pre_allocated_size_) { |
371 | 82.5k | if (ftruncate(fd_, filesize_) < 0) { |
372 | 0 | s = STATUS_IO_ERROR(filename_, errno); |
373 | 0 | pending_sync_ = true; |
374 | 0 | } |
375 | 82.5k | } |
376 | | |
377 | 2.25M | if (sync_on_close_) { |
378 | 96.7k | Status sync_status = Sync(); |
379 | 96.7k | if (!sync_status.ok()) { |
380 | 0 | LOG(ERROR) << "Unable to Sync " << filename_ << ": " << sync_status.ToString(); |
381 | 0 | if (s.ok()) { |
382 | 0 | s = sync_status; |
383 | 0 | } |
384 | 0 | } |
385 | 96.7k | } |
386 | | |
387 | 2.25M | if (close(fd_) < 0) { |
388 | 0 | if (s.ok()) { |
389 | 0 | s = STATUS_IO_ERROR(filename_, errno); |
390 | 0 | } |
391 | 0 | } |
392 | | |
393 | 2.25M | fd_ = -1; |
394 | 2.25M | return s; |
395 | 2.25M | } |
396 | | |
397 | 1 | Status Flush(FlushMode mode) override { |
398 | 1 | TRACE_EVENT1("io", "PosixWritableFile::Flush", "path", filename_); |
399 | 1 | ThreadRestrictions::AssertIOAllowed(); |
400 | 1 | if (FLAGS_never_fsync) { |
401 | 1 | return Status::OK(); |
402 | 1 | } |
403 | | #if defined(__linux__) |
404 | | int flags = SYNC_FILE_RANGE_WRITE; |
405 | | if (mode == FLUSH_SYNC) { |
406 | | flags |= SYNC_FILE_RANGE_WAIT_AFTER; |
407 | | } |
408 | | if (sync_file_range(fd_, 0, 0, flags) < 0) { |
409 | | return STATUS_IO_ERROR(filename_, errno); |
410 | | } |
411 | | #else |
412 | 0 | if (fsync(fd_) < 0) { |
413 | 0 | return STATUS_IO_ERROR(filename_, errno); |
414 | 0 | } |
415 | 0 | #endif |
416 | 0 | return Status::OK(); |
417 | 0 | } |
418 | | |
419 | 2.75M | Status Sync() override { |
420 | 2.75M | TRACE_EVENT1("io", "PosixWritableFile::Sync", "path", filename_); |
421 | 2.75M | ThreadRestrictions::AssertIOAllowed(); |
422 | 2.75M | LOG_SLOW_EXECUTION2.75M (WARNING, 1000, Substitute("sync call for $0", filename_)) { |
423 | 2.75M | if (pending_sync_) { |
424 | 2.74M | pending_sync_ = false; |
425 | 2.74M | RETURN_NOT_OK(DoSync(fd_, filename_)); |
426 | 2.74M | } |
427 | 2.75M | } |
428 | 2.75M | return Status::OK(); |
429 | 2.75M | } |
430 | | |
431 | 25.1M | uint64_t Size() const override { |
432 | 25.1M | return filesize_; |
433 | 25.1M | } |
434 | | |
435 | 11 | const string& filename() const override { return filename_; } |
436 | | |
437 | | protected: |
438 | | const std::string filename_; |
439 | | int fd_; |
440 | | bool sync_on_close_; |
441 | | uint64_t filesize_; |
442 | | uint64_t pre_allocated_size_; |
443 | | bool pending_sync_; |
444 | | |
445 | | private: |
446 | 29.5M | Status DoWritev(const Slice* slices, size_t n) { |
447 | 29.5M | ThreadRestrictions::AssertIOAllowed(); |
448 | 29.5M | DCHECK_LE(n, IOV_MAX); |
449 | | |
450 | 29.5M | struct iovec iov[n]; |
451 | 29.5M | ssize_t nbytes = 0; |
452 | | |
453 | 84.2M | for (size_t i = 0; i < n; ++i54.6M ) { |
454 | 54.6M | const Slice& data = slices[i]; |
455 | 54.6M | iov[i].iov_base = const_cast<uint8_t*>(data.data()); |
456 | 54.6M | iov[i].iov_len = data.size(); |
457 | 54.6M | nbytes += data.size(); |
458 | 54.6M | } |
459 | | |
460 | 29.5M | ssize_t written = writev(fd_, iov, narrow_cast<int>(n)); |
461 | | |
462 | 29.5M | if (PREDICT_FALSE(written == -1)) { |
463 | 0 | int err = errno; |
464 | 0 | return STATUS_IO_ERROR(filename_, err); |
465 | 0 | } |
466 | | |
467 | 29.5M | filesize_ += written; |
468 | | |
469 | 29.5M | if (PREDICT_FALSE(written != nbytes)) { |
470 | 0 | return STATUS_FORMAT( |
471 | 0 | IOError, "writev error: expected to write $0 bytes, wrote $1 bytes instead", |
472 | 0 | nbytes, written); |
473 | 0 | } |
474 | | |
475 | 29.5M | return Status::OK(); |
476 | 29.5M | } |
477 | | }; |
478 | | |
479 | | #if defined(__linux__) |
480 | | class PosixDirectIOWritableFile final : public PosixWritableFile { |
481 | | public: |
482 | | PosixDirectIOWritableFile(const std::string &fname, int fd, uint64_t file_size, |
483 | | bool sync_on_close) |
484 | | : PosixWritableFile(fname, fd, file_size, false /* sync_on_close */) { |
485 | | |
486 | | if (file_size != 0) { |
487 | | // For now, we don't support appending to an already existing file (of non-zero size). |
488 | | // TODO(hector): If file size is not a multiple of block_size_, read the |
489 | | // last aligned block. |
490 | | LOG(FATAL) << "file size != 0"; |
491 | | } |
492 | | |
493 | | next_write_offset_ = 0; |
494 | | last_block_used_bytes_ = 0; |
495 | | block_size_ = FLAGS_o_direct_block_size_bytes; |
496 | | last_block_idx_ = 0; |
497 | | has_new_data_ = false; |
498 | | real_size_ = 0; |
499 | | CHECK_GE(block_size_, 512); |
500 | | } |
501 | | |
502 | | ~PosixDirectIOWritableFile() { |
503 | | if (fd_ >= 0) { |
504 | | WARN_NOT_OK(Close(), "Failed to close " + filename_); |
505 | | } |
506 | | } |
507 | | |
508 | | Status Append(const Slice &const_data_slice) override { |
509 | | ThreadRestrictions::AssertIOAllowed(); |
510 | | Slice data_slice = const_data_slice; |
511 | | |
512 | | while (data_slice.size() > 0) { |
513 | | size_t max_data = IOV_MAX * block_size_ - BufferedByteCount(); |
514 | | CHECK_GT(IOV_MAX, 0); |
515 | | CHECK_GT(IOV_MAX * block_size_, BufferedByteCount()); |
516 | | CHECK_GT(max_data, 0); |
517 | | const auto data = Slice(data_slice.data(), std::min(data_slice.size(), max_data)); |
518 | | |
519 | | RETURN_NOT_OK(MaybeAllocateMemory(data.size())); |
520 | | RETURN_NOT_OK(WriteToBuffer(data)); |
521 | | |
522 | | if (data_slice.size() >= max_data) { |
523 | | data_slice.remove_prefix(max_data); |
524 | | RETURN_NOT_OK(Sync()); |
525 | | } else { |
526 | | break; |
527 | | } |
528 | | } |
529 | | real_size_ += const_data_slice.size(); |
530 | | return Status::OK(); |
531 | | } |
532 | | |
533 | | Status AppendSlices(const Slice* slices, size_t num) override { |
534 | | ThreadRestrictions::AssertIOAllowed(); |
535 | | for (auto end = slices + num; slices != end; ++slices) { |
536 | | RETURN_NOT_OK(Append(*slices)); |
537 | | } |
538 | | return Status::OK(); |
539 | | } |
540 | | |
541 | | Status Close() override { |
542 | | TRACE_EVENT1("io", "PosixDirectIOWritableFile::Close", "path", filename_); |
543 | | ThreadRestrictions::AssertIOAllowed(); |
544 | | RETURN_NOT_OK(Sync()); |
545 | | LOG(INFO) << "Closing file " << filename_ << " with " << block_ptr_vec_.size() << " blocks"; |
546 | | off_t fsize; |
547 | | fsize = lseek(fd_, 0, SEEK_END); |
548 | | CHECK_EQ(fsize, std::max(filesize_, pre_allocated_size_)); |
549 | | |
550 | | if (real_size_ < filesize_ || real_size_ < pre_allocated_size_) { |
551 | | LOG(INFO) << filename_ << ": Truncating file from size: " << filesize_ |
552 | | << " to size: " << real_size_ |
553 | | << ". Preallocated size: " << pre_allocated_size_; |
554 | | if (ftruncate(fd_, real_size_) != 0) { |
555 | | return STATUS_IO_ERROR(filename_, errno); |
556 | | } |
557 | | } |
558 | | |
559 | | if (close(fd_) < 0) { |
560 | | return STATUS_IO_ERROR(filename_, errno); |
561 | | } |
562 | | |
563 | | fd_ = -1; |
564 | | return Status::OK(); |
565 | | } |
566 | | |
567 | | Status Flush(FlushMode mode) override { |
568 | | ThreadRestrictions::AssertIOAllowed(); |
569 | | return Sync(); |
570 | | } |
571 | | |
572 | | Status Sync() override { |
573 | | ThreadRestrictions::AssertIOAllowed(); |
574 | | return DoWrite(); |
575 | | } |
576 | | |
577 | | uint64_t Size() const override { |
578 | | return real_size_; |
579 | | } |
580 | | |
581 | | private: |
582 | | // The number of bytes buffered so far (that will be written out as part of the next write). |
583 | | size_t BufferedByteCount() { |
584 | | return last_block_idx_ * block_size_ + last_block_used_bytes_; |
585 | | } |
586 | | |
587 | | Status WriteToBuffer(Slice data) { |
588 | | auto last_block_used_bytes = last_block_used_bytes_; |
589 | | auto last_block_idx = last_block_idx_; |
590 | | auto total_bytes_cached = BufferedByteCount(); |
591 | | auto request_size = data.size(); |
592 | | |
593 | | CHECK_GT(data.size(), 0); |
594 | | // Used only for the first block. Reset to 0 after the first memcpy. |
595 | | size_t block_offset = last_block_used_bytes_; |
596 | | auto i = last_block_idx_; |
597 | | if (last_block_used_bytes_ == block_size_) { |
598 | | // Start writing in a new block if the last block is full. |
599 | | i++; |
600 | | block_offset = 0; |
601 | | } |
602 | | while (data.size() > 0) { |
603 | | last_block_used_bytes_ = block_size_; |
604 | | size_t block_data_size = std::min(data.size(), block_size_ - block_offset); |
605 | | if (block_data_size + block_offset < block_size_) { |
606 | | // Writing the last block. |
607 | | last_block_used_bytes_ = block_data_size + block_offset; |
608 | | memset(&block_ptr_vec_[i].get()[last_block_used_bytes_], 0, |
609 | | block_size_ - last_block_used_bytes_); |
610 | | } |
611 | | CHECK(i < block_ptr_vec_.size()); |
612 | | memcpy(&block_ptr_vec_[i].get()[block_offset], data.data(), block_data_size); |
613 | | block_offset = 0; |
614 | | data.remove_prefix(block_data_size); |
615 | | last_block_idx_ = i++; |
616 | | has_new_data_ = true; |
617 | | } |
618 | | |
619 | | CHECK_GE(last_block_idx_, last_block_idx); |
620 | | if (last_block_idx_ == last_block_idx) { |
621 | | CHECK_GE(last_block_used_bytes_, last_block_used_bytes); |
622 | | } |
623 | | CHECK_EQ(BufferedByteCount(), total_bytes_cached + request_size); |
624 | | |
625 | | return Status::OK(); |
626 | | } |
627 | | |
628 | | Status DoWrite() { |
629 | | if (!has_new_data_) { |
630 | | return Status::OK(); |
631 | | } |
632 | | CHECK_LE(last_block_used_bytes_, block_size_); |
633 | | CHECK_LT(last_block_idx_, block_ptr_vec_.size()); |
634 | | auto blocks_to_write = last_block_idx_ + 1; |
635 | | CHECK_LE(blocks_to_write, IOV_MAX); |
636 | | |
637 | | struct iovec iov[blocks_to_write]; |
638 | | for (size_t j = 0; j < blocks_to_write; j++) { |
639 | | iov[j].iov_base = block_ptr_vec_[j].get(); |
640 | | iov[j].iov_len = block_size_; |
641 | | } |
642 | | ssize_t bytes_to_write = blocks_to_write * block_size_; |
643 | | ssize_t written = pwritev(fd_, iov, narrow_cast<int>(blocks_to_write), next_write_offset_); |
644 | | |
645 | | if (PREDICT_FALSE(written == -1)) { |
646 | | int err = errno; |
647 | | return STATUS_IO_ERROR(filename_, err); |
648 | | } |
649 | | |
650 | | if (PREDICT_FALSE(written != bytes_to_write)) { |
651 | | return STATUS(IOError, |
652 | | Substitute("pwritev error: expected to write $0 bytes, wrote $1 bytes instead", |
653 | | bytes_to_write, written)); |
654 | | } |
655 | | |
656 | | filesize_ = next_write_offset_ + written; |
657 | | CHECK_EQ(filesize_, align_up(filesize_, block_size_)); |
658 | | |
659 | | next_write_offset_ = filesize_; |
660 | | |
661 | | if (last_block_used_bytes_ != block_size_) { |
662 | | // Next write will happen at filesize_ - block_size_ offset in the file if the last block is |
663 | | // not full. |
664 | | next_write_offset_ -= block_size_; |
665 | | |
666 | | // Since the last block is only partially full, make it the first block so we can append to |
667 | | // it in the next call. |
668 | | if (last_block_idx_ > 0) { |
669 | | std::swap(block_ptr_vec_[0], block_ptr_vec_[last_block_idx_]); |
670 | | } |
671 | | |
672 | | } else { |
673 | | last_block_used_bytes_ = 0; |
674 | | } |
675 | | last_block_idx_ = 0; |
676 | | has_new_data_ = false; |
677 | | return Status::OK(); |
678 | | } |
679 | | |
680 | | Status MaybeAllocateMemory(size_t data_size) { |
681 | | auto buffered_data_size = last_block_idx_ * block_size_ + last_block_used_bytes_; |
682 | | auto bytes_to_write = align_up(buffered_data_size + data_size, block_size_); |
683 | | auto blocks_to_write = bytes_to_write / block_size_; |
684 | | |
685 | | if (blocks_to_write > block_ptr_vec_.size()) { |
686 | | auto nblocks = blocks_to_write - block_ptr_vec_.size(); |
687 | | for (size_t i = 0; i < nblocks; i++) { |
688 | | void *temp_buf = nullptr; |
689 | | auto err = posix_memalign(&temp_buf, FLAGS_o_direct_block_alignment_bytes, block_size_); |
690 | | if (err) { |
691 | | return STATUS(RuntimeError, "Unable to allocate memory", Errno(err)); |
692 | | } |
693 | | |
694 | | uint8_t *start = static_cast<uint8_t *>(temp_buf); |
695 | | block_ptr_vec_.push_back(std::shared_ptr<uint8_t>(start, [](uint8_t *p) { free(p); })); |
696 | | } |
697 | | |
698 | | CHECK_EQ(block_ptr_vec_.size() * block_size_, bytes_to_write); |
699 | | } |
700 | | return Status::OK(); |
701 | | } |
702 | | |
703 | | size_t next_write_offset_; |
704 | | vector<std::shared_ptr<uint8_t>> block_ptr_vec_; |
705 | | size_t last_block_used_bytes_; |
706 | | size_t last_block_idx_; |
707 | | size_t block_size_; |
708 | | bool has_new_data_; |
709 | | size_t real_size_; |
710 | | }; |
711 | | #endif |
712 | | |
713 | | class PosixRWFile final : public RWFile { |
714 | | // is not employed. |
715 | | public: |
716 | | PosixRWFile(string fname, int fd, bool sync_on_close) |
717 | | : filename_(std::move(fname)), |
718 | | fd_(fd), |
719 | | sync_on_close_(sync_on_close), |
720 | 5 | pending_sync_(false) {} |
721 | | |
722 | 5 | ~PosixRWFile() { |
723 | 5 | if (fd_ >= 0) { |
724 | | // Virtual method call in destructor. |
725 | 3 | WARN_NOT_OK(Close(), "Failed to close " + filename_); |
726 | 3 | } |
727 | 5 | } |
728 | | |
729 | | virtual Status Read(uint64_t offset, size_t length, |
730 | 3 | Slice* result, uint8_t* scratch) const override { |
731 | 3 | ThreadRestrictions::AssertIOAllowed(); |
732 | 3 | auto rem = length; |
733 | 3 | uint8_t* dst = scratch; |
734 | 6 | while (rem > 0) { |
735 | 3 | ssize_t r = pread(fd_, dst, rem, offset); |
736 | 3 | if (r < 0) { |
737 | | // An error: return a non-ok status. |
738 | 0 | return STATUS_IO_ERROR(filename_, errno); |
739 | 0 | } |
740 | 3 | Slice this_result(dst, r); |
741 | 3 | DCHECK_LE(this_result.size(), rem); |
742 | 3 | if (this_result.size() == 0) { |
743 | | // EOF |
744 | 0 | return STATUS_FORMAT(IOError, "EOF trying to read $0 bytes at offset $1", length, offset); |
745 | 0 | } |
746 | 3 | dst += this_result.size(); |
747 | 3 | rem -= this_result.size(); |
748 | 3 | offset += this_result.size(); |
749 | 3 | } |
750 | 3 | DCHECK_EQ(0, rem); |
751 | 3 | *result = Slice(scratch, length); |
752 | 3 | return Status::OK(); |
753 | 3 | } |
754 | | |
755 | 6 | Status Write(uint64_t offset, const Slice& data) override { |
756 | 6 | ThreadRestrictions::AssertIOAllowed(); |
757 | 6 | ssize_t written = pwrite(fd_, data.data(), data.size(), offset); |
758 | | |
759 | 6 | if (PREDICT_FALSE(written == -1)) { |
760 | 0 | int err = errno; |
761 | 0 | return STATUS_IO_ERROR(filename_, err); |
762 | 0 | } |
763 | | |
764 | 6 | if (PREDICT_FALSE(written != implicit_cast<ssize_t>(data.size()))) { |
765 | 0 | return STATUS(IOError, |
766 | 0 | Substitute("pwrite error: expected to write $0 bytes, wrote $1 bytes instead", |
767 | 0 | data.size(), written)); |
768 | 0 | } |
769 | | |
770 | 6 | pending_sync_ = true; |
771 | 6 | return Status::OK(); |
772 | 6 | } |
773 | | |
774 | 0 | Status PreAllocate(uint64_t offset, size_t length) override { |
775 | 0 | TRACE_EVENT1("io", "PosixRWFile::PreAllocate", "path", filename_); |
776 | 0 | ThreadRestrictions::AssertIOAllowed(); |
777 | 0 | if (fallocate(fd_, 0, offset, length) < 0) { |
778 | 0 | if (errno == EOPNOTSUPP) { |
779 | 0 | YB_LOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate()."; |
780 | 0 | } else if (errno == ENOSYS) { |
781 | 0 | YB_LOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate()."; |
782 | 0 | } else { |
783 | 0 | return STATUS_IO_ERROR(filename_, errno); |
784 | 0 | } |
785 | 0 | } |
786 | 0 | return Status::OK(); |
787 | 0 | } |
788 | | |
789 | 0 | Status PunchHole(uint64_t offset, size_t length) override { |
790 | | #if defined(__linux__) |
791 | | TRACE_EVENT1("io", "PosixRWFile::PunchHole", "path", filename_); |
792 | | ThreadRestrictions::AssertIOAllowed(); |
793 | | if (fallocate(fd_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length) < 0) { |
794 | | return STATUS_IO_ERROR(filename_, errno); |
795 | | } |
796 | | return Status::OK(); |
797 | | #else |
798 | 0 | return STATUS(NotSupported, "Hole punching not supported on this platform"); |
799 | 0 | #endif |
800 | 0 | } |
801 | | |
802 | 0 | Status Flush(FlushMode mode, uint64_t offset, size_t length) override { |
803 | 0 | TRACE_EVENT1("io", "PosixRWFile::Flush", "path", filename_); |
804 | 0 | ThreadRestrictions::AssertIOAllowed(); |
805 | 0 | if (FLAGS_never_fsync) { |
806 | 0 | return Status::OK(); |
807 | 0 | } |
808 | | #if defined(__linux__) |
809 | | int flags = SYNC_FILE_RANGE_WRITE; |
810 | | if (mode == FLUSH_SYNC) { |
811 | | flags |= SYNC_FILE_RANGE_WAIT_AFTER; |
812 | | } |
813 | | if (sync_file_range(fd_, offset, length, flags) < 0) { |
814 | | return STATUS_IO_ERROR(filename_, errno); |
815 | | } |
816 | | #else |
817 | 0 | if (fsync(fd_) < 0) { |
818 | 0 | return STATUS_IO_ERROR(filename_, errno); |
819 | 0 | } |
820 | 0 | #endif |
821 | 0 | return Status::OK(); |
822 | 0 | } |
823 | | |
824 | 3 | Status Sync() override { |
825 | 3 | TRACE_EVENT1("io", "PosixRWFile::Sync", "path", filename_); |
826 | 3 | ThreadRestrictions::AssertIOAllowed(); |
827 | 3 | LOG_SLOW_EXECUTION(WARNING, 1000, Substitute("sync call for $0", filename())) { |
828 | 3 | if (pending_sync_) { |
829 | 2 | pending_sync_ = false; |
830 | 2 | RETURN_NOT_OK(DoSync(fd_, filename_)); |
831 | 2 | } |
832 | 3 | } |
833 | 3 | return Status::OK(); |
834 | 3 | } |
835 | | |
836 | 5 | Status Close() override { |
837 | 5 | TRACE_EVENT1("io", "PosixRWFile::Close", "path", filename_); |
838 | 5 | ThreadRestrictions::AssertIOAllowed(); |
839 | 5 | Status s; |
840 | | |
841 | 5 | if (sync_on_close_) { |
842 | | // Virtual function call in destructor. |
843 | 3 | s = Sync(); |
844 | 3 | if (!s.ok()) { |
845 | 0 | LOG(ERROR) << "Unable to Sync " << filename_ << ": " << s.ToString(); |
846 | 0 | } |
847 | 3 | } |
848 | | |
849 | 5 | if (close(fd_) < 0) { |
850 | 0 | if (s.ok()) { |
851 | 0 | s = STATUS_IO_ERROR(filename_, errno); |
852 | 0 | } |
853 | 0 | } |
854 | | |
855 | 5 | fd_ = -1; |
856 | 5 | return s; |
857 | 5 | } |
858 | | |
859 | 2 | Status Size(uint64_t* size) const override { |
860 | 2 | TRACE_EVENT1("io", "PosixRWFile::Size", "path", filename_); |
861 | 2 | ThreadRestrictions::AssertIOAllowed(); |
862 | 2 | struct stat st; |
863 | 2 | if (fstat(fd_, &st) == -1) { |
864 | 0 | return STATUS_IO_ERROR(filename_, errno); |
865 | 0 | } |
866 | 2 | *size = st.st_size; |
867 | 2 | return Status::OK(); |
868 | 2 | } |
869 | | |
870 | 3 | const string& filename() const override { |
871 | 3 | return filename_; |
872 | 3 | } |
873 | | |
874 | | private: |
875 | | const std::string filename_; |
876 | | int fd_; |
877 | | bool sync_on_close_; |
878 | | bool pending_sync_; |
879 | | }; |
880 | | |
// Set of pathnames that are locked, and a mutex for protecting changes to the set.
static std::set<std::string> locked_files;
static std::mutex mutex_locked_files;

// Helper function to lock/unlock file whose filename and file descriptor are passed in. Returns -1
// on failure and a value other than -1 (as mentioned in fcntl() doc) on success. On failure errno
// is ENOLCK when the request was rejected by the locked_files bookkeeping, or whatever fcntl() set
// otherwise.
//
// fname: key used in locked_files.
// fd: descriptor the fcntl() record lock is applied to.
// lock: true to acquire a write lock on the whole file, false to release it.
// recursive_lock_ok: when locking, whether a name already present in locked_files is acceptable.
//     Ignored when unlocking.
static int LockOrUnlock(const std::string& fname,
                        int fd,
                        bool lock,
                        bool recursive_lock_ok) {
  std::lock_guard<std::mutex> guard(mutex_locked_files);
  // True only when this call added 'fname' to locked_files, i.e. when this call is the one that
  // has to undo the registration should fcntl() fail below.
  bool inserted_here = false;
  if (lock) {
    // If recursive locks on the same file must be disallowed, but the specified file name already
    // exists in the locked_files set, then it is already locked, so we fail this lock attempt.
    // Otherwise, we insert the specified file name into locked_files. This check is needed because
    // fcntl() record locks belong to the process: a second F_SETLK from the same process never
    // conflicts with the first one.
    inserted_here = locked_files.insert(fname).second;
    if (!inserted_here && !recursive_lock_ok) {
      errno = ENOLCK;
      return -1;
    }
  } else {
    // If we are unlocking, then verify that we had locked it earlier, it should already exist in
    // locked_files. Remove it from locked_files.
    if (locked_files.erase(fname) != 1) {
      errno = ENOLCK;
      return -1;
    }
  }
  errno = 0;
  struct flock f;
  memset(&f, 0, sizeof(f));
  f.l_type = (lock ? F_WRLCK : F_UNLCK);
  f.l_whence = SEEK_SET;
  f.l_start = 0;
  f.l_len = 0;        // Lock/unlock entire file.
  int value = fcntl(fd, F_SETLK, &f);
  if (value == -1 && inserted_here) {
    // Locking failed, so take back the registration made above. A name that was already present
    // (recursive case) belongs to an earlier, still valid lock and must stay registered.
    // erase() does not touch errno, so the caller still sees fcntl()'s error code.
    locked_files.erase(fname);
  }
  return value;
}
924 | | |
925 | | class PosixFileLock : public FileLock { |
926 | | public: |
927 | | int fd_; |
928 | | std::string filename; |
929 | | }; |
930 | | |
931 | | class PosixEnv : public Env { |
932 | | public: |
933 | | PosixEnv(); |
934 | | explicit PosixEnv(std::unique_ptr<FileFactory> file_factory); |
935 | 155 | virtual ~PosixEnv() = default; |
936 | | |
937 | | virtual Status NewSequentialFile(const std::string& fname, |
938 | 4.10k | std::unique_ptr<SequentialFile>* result) override { |
939 | 4.10k | return file_factory_->NewSequentialFile(fname, result); |
940 | 4.10k | } |
941 | | |
942 | | virtual Status NewRandomAccessFile(const std::string& fname, |
943 | 873k | std::unique_ptr<RandomAccessFile>* result) override { |
944 | 873k | return file_factory_->NewRandomAccessFile(fname, result); |
945 | 873k | } |
946 | | |
947 | | virtual Status NewWritableFile(const std::string& fname, |
948 | 2.33k | std::unique_ptr<WritableFile>* result) override { |
949 | 2.33k | return file_factory_->NewWritableFile(fname, result); |
950 | 2.33k | } |
951 | | |
952 | | virtual Status NewWritableFile(const WritableFileOptions& opts, |
953 | | const std::string& fname, |
954 | 11.4k | std::unique_ptr<WritableFile>* result) override { |
955 | 11.4k | return file_factory_->NewWritableFile(opts, fname, result); |
956 | 11.4k | } |
957 | | |
958 | | virtual Status NewTempWritableFile(const WritableFileOptions& opts, |
959 | | const std::string& name_template, |
960 | | std::string* created_filename, |
961 | 2.31M | std::unique_ptr<WritableFile>* result) override { |
962 | 2.31M | return file_factory_->NewTempWritableFile(opts, name_template, created_filename, result); |
963 | 2.31M | } |
964 | | |
965 | | virtual Status NewRWFile(const string& fname, |
966 | 1 | std::unique_ptr<RWFile>* result) override { |
967 | 1 | return file_factory_->NewRWFile(fname, result); |
968 | 1 | } |
969 | | |
970 | | virtual Status NewRWFile(const RWFileOptions& opts, |
971 | | const string& fname, |
972 | 5 | std::unique_ptr<RWFile>* result) override { |
973 | 5 | return file_factory_->NewRWFile(opts, fname, result); |
974 | 5 | } |
975 | | |
976 | 1.96M | bool FileExists(const std::string& fname) override { |
977 | 1.96M | TRACE_EVENT1("io", "PosixEnv::FileExists", "path", fname); |
978 | 1.96M | ThreadRestrictions::AssertIOAllowed(); |
979 | 1.96M | return access(fname.c_str(), F_OK) == 0; |
980 | 1.96M | } |
981 | | |
982 | 10 | virtual bool DirExists(const std::string& dname) override { |
983 | 10 | TRACE_EVENT1("io", "PosixEnv::DirExists", "path", dname); |
984 | 10 | ThreadRestrictions::AssertIOAllowed(); |
985 | 10 | struct stat statbuf; |
986 | 10 | if (stat(dname.c_str(), &statbuf) == 0) { |
987 | 7 | return S_ISDIR(statbuf.st_mode); |
988 | 7 | } |
989 | 3 | return false; |
990 | 10 | } |
991 | | |
992 | | CHECKED_STATUS GetChildren(const std::string& dir, |
993 | | ExcludeDots exclude_dots, |
994 | 327k | std::vector<std::string>* result) override { |
995 | 327k | TRACE_EVENT1("io", "PosixEnv::GetChildren", "path", dir); |
996 | 327k | ThreadRestrictions::AssertIOAllowed(); |
997 | 327k | result->clear(); |
998 | 327k | DIR* d = opendir(dir.c_str()); |
999 | 327k | if (d == nullptr) { |
1000 | 0 | return STATUS_IO_ERROR(dir, errno); |
1001 | 0 | } |
1002 | 327k | struct dirent* entry; |
1003 | | // TODO: lint: Consider using readdir_r(...) instead of readdir(...) for improved thread safety. |
1004 | 1.01M | while ((entry = readdir(d)) != nullptr) { |
1005 | 688k | if (exclude_dots && (350k strcmp(entry->d_name, ".") == 0350k || strcmp(entry->d_name, "..") == 0185k )) { |
1006 | 330k | continue; |
1007 | 330k | } |
1008 | 357k | result->push_back(entry->d_name); |
1009 | 357k | } |
1010 | 327k | closedir(d); |
1011 | 327k | return Status::OK(); |
1012 | 327k | } |
1013 | | |
1014 | 393k | Status DeleteFile(const std::string& fname) override { |
1015 | 393k | TRACE_EVENT1("io", "PosixEnv::DeleteFile", "path", fname); |
1016 | 393k | ThreadRestrictions::AssertIOAllowed(); |
1017 | 393k | Status result; |
1018 | 393k | if (unlink(fname.c_str()) != 0) { |
1019 | 3.23k | result = STATUS_IO_ERROR(fname, errno); |
1020 | 3.23k | } |
1021 | 393k | return result; |
1022 | 393k | }; |
1023 | | |
1024 | 3.01M | Status CreateDir(const std::string& name) override { |
1025 | 3.01M | TRACE_EVENT1("io", "PosixEnv::CreateDir", "path", name); |
1026 | 3.01M | ThreadRestrictions::AssertIOAllowed(); |
1027 | 3.01M | Status result; |
1028 | 3.01M | if (mkdir(name.c_str(), 0755) != 0) { |
1029 | 2.08M | result = STATUS_IO_ERROR(name, errno); |
1030 | 2.08M | } |
1031 | 3.01M | return result; |
1032 | 3.01M | }; |
1033 | | |
1034 | 204k | Status DeleteDir(const std::string& name) override { |
1035 | 204k | TRACE_EVENT1("io", "PosixEnv::DeleteDir", "path", name); |
1036 | 204k | ThreadRestrictions::AssertIOAllowed(); |
1037 | 204k | Status result; |
1038 | 204k | if (rmdir(name.c_str()) != 0) { |
1039 | 0 | result = STATUS_IO_ERROR(name, errno); |
1040 | 0 | } |
1041 | 204k | return result; |
1042 | 204k | }; |
1043 | | |
1044 | 3.43M | Status SyncDir(const std::string& dirname) override { |
1045 | 3.43M | TRACE_EVENT1("io", "SyncDir", "path", dirname); |
1046 | 3.43M | ThreadRestrictions::AssertIOAllowed(); |
1047 | 3.43M | if (FLAGS_never_fsync) return Status::OK()3.43M ; |
1048 | 16 | int dir_fd; |
1049 | 16 | if ((dir_fd = open(dirname.c_str(), O_DIRECTORY|O_RDONLY)) == -1) { |
1050 | 0 | return STATUS_IO_ERROR(dirname, errno); |
1051 | 0 | } |
1052 | 16 | ScopedFdCloser fd_closer(dir_fd); |
1053 | 16 | if (fsync(dir_fd) != 0) { |
1054 | 0 | return STATUS_IO_ERROR(dirname, errno); |
1055 | 0 | } |
1056 | 16 | return Status::OK(); |
1057 | 16 | } |
1058 | | |
1059 | 155k | Status DeleteRecursively(const std::string &name) override { |
1060 | 155k | return Walk(name, POST_ORDER, std::bind(&PosixEnv::DeleteRecursivelyCb, this, _1, _2, _3)); |
1061 | 155k | } |
1062 | | |
1063 | 834k | Result<uint64_t> GetFileSize(const std::string& fname) override { |
1064 | 834k | return file_factory_->GetFileSize(fname); |
1065 | 834k | } |
1066 | | |
1067 | 6.66k | Result<uint64_t> GetFileINode(const std::string& fname) override { |
1068 | 6.66k | return GetFileStat( |
1069 | 6.66k | fname, "PosixEnv::GetFileINode", [](const struct stat& sbuf) { return sbuf.st_ino; }); |
1070 | 6.66k | } |
1071 | | |
1072 | 0 | Result<uint64_t> GetFileSizeOnDisk(const std::string& fname) override { |
1073 | 0 | return GetFileStat( |
1074 | 0 | fname, "PosixEnv::GetFileSizeOnDisk", [](const struct stat& sbuf) { |
1075 | | // From stat(2): |
1076 | | // |
1077 | | // The st_blocks field indicates the number of blocks allocated to |
1078 | | // the file, 512-byte units. (This may be smaller than st_size/512 |
1079 | | // when the file has holes.) |
1080 | 0 | return sbuf.st_blocks * 512; |
1081 | 0 | }); |
1082 | 0 | } |
1083 | | |
1084 | 3 | Result<uint64_t> GetBlockSize(const string& fname) override { |
1085 | 3 | return GetFileStat( |
1086 | 3 | fname, "PosixEnv::GetBlockSize", [](const struct stat& sbuf) { return sbuf.st_blksize; }2 ); |
1087 | 3 | } |
1088 | | |
1089 | | CHECKED_STATUS LinkFile(const std::string& src, |
1090 | 674 | const std::string& target) override { |
1091 | 674 | if (link(src.c_str(), target.c_str()) != 0) { |
1092 | 0 | if (errno == EXDEV) { |
1093 | 0 | return STATUS(NotSupported, "No cross FS links allowed"); |
1094 | 0 | } |
1095 | 0 | return STATUS_IO_ERROR(src, errno); |
1096 | 0 | } |
1097 | 674 | return Status::OK(); |
1098 | 674 | } |
1099 | | |
1100 | 444 | Result<std::string> ReadLink(const std::string& link) override { |
1101 | 444 | char buf[PATH_MAX]; |
1102 | 444 | const auto len = readlink(link.c_str(), buf, sizeof(buf)); |
1103 | 444 | if (len > -1) { |
1104 | 444 | return std::string(buf, buf + len); |
1105 | 444 | } |
1106 | 0 | return STATUS_IO_ERROR(link, errno); |
1107 | 444 | } |
1108 | | |
1109 | 2.27M | Status RenameFile(const std::string& src, const std::string& target) override { |
1110 | 2.27M | TRACE_EVENT2("io", "PosixEnv::RenameFile", "src", src, "dst", target); |
1111 | 2.27M | ThreadRestrictions::AssertIOAllowed(); |
1112 | 2.27M | Status result; |
1113 | 2.27M | if (rename(src.c_str(), target.c_str()) != 0) { |
1114 | 0 | result = STATUS_IO_ERROR(Format("Rename $0 => $1", src, target), errno); |
1115 | 0 | } |
1116 | 2.27M | return result; |
1117 | 2.27M | } |
1118 | | |
1119 | | virtual Status LockFile(const std::string& fname, |
1120 | | FileLock** lock, |
1121 | 29.5k | bool recursive_lock_ok) override { |
1122 | 29.5k | TRACE_EVENT1("io", "PosixEnv::LockFile", "path", fname); |
1123 | 29.5k | ThreadRestrictions::AssertIOAllowed(); |
1124 | 29.5k | *lock = nullptr; |
1125 | 29.5k | Status result; |
1126 | 29.5k | int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); |
1127 | 29.5k | if (fd < 0) { |
1128 | 12.0k | result = STATUS_IO_ERROR(fname, errno); |
1129 | 17.4k | } else if (LockOrUnlock(fname, fd, true /* lock */, recursive_lock_ok) == -1) { |
1130 | 3 | result = STATUS_IO_ERROR("lock " + fname, errno); |
1131 | 3 | close(fd); |
1132 | 17.4k | } else { |
1133 | 17.4k | auto my_lock = new PosixFileLock; |
1134 | 17.4k | my_lock->fd_ = fd; |
1135 | 17.4k | my_lock->filename = fname; |
1136 | 17.4k | *lock = my_lock; |
1137 | 17.4k | } |
1138 | 29.5k | return result; |
1139 | 29.5k | } |
1140 | | |
1141 | 0 | Status UnlockFile(FileLock* lock) override { |
1142 | 0 | TRACE_EVENT0("io", "PosixEnv::UnlockFile"); |
1143 | 0 | ThreadRestrictions::AssertIOAllowed(); |
1144 | 0 | PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); |
1145 | 0 | Status result; |
1146 | 0 | if (LockOrUnlock(my_lock->filename, |
1147 | 0 | my_lock->fd_, |
1148 | 0 | false /* lock */, |
1149 | 0 | false /* recursive_lock_ok (unused when lock = false) */) == -1) { |
1150 | 0 | result = STATUS_IO_ERROR("unlock", errno); |
1151 | 0 | } |
1152 | 0 | close(my_lock->fd_); |
1153 | 0 | delete my_lock; |
1154 | 0 | return result; |
1155 | 0 | } |
1156 | | |
1157 | 3.75k | Status GetTestDirectory(std::string* result) override { |
1158 | 3.75k | string dir; |
1159 | 3.75k | const char* env = getenv("TEST_TMPDIR"); |
1160 | 3.75k | if (env && env[0] != '\0') { |
1161 | 3.75k | dir = env; |
1162 | 3.75k | } else { |
1163 | 0 | char buf[100]; |
1164 | 0 | snprintf(buf, sizeof(buf), "/tmp/ybtest-%d", static_cast<int>(geteuid())); |
1165 | 0 | dir = buf; |
1166 | 0 | } |
1167 | | // Directory may already exist |
1168 | 3.75k | WARN_NOT_OK(CreateDir(dir), "Create test dir failed"); |
1169 | | // /tmp may be a symlink, so canonicalize the path. |
1170 | 3.75k | return Canonicalize(dir, result); |
1171 | 3.75k | } |
1172 | | |
1173 | 10.8G | uint64_t gettid() override { |
1174 | | // Platform-independent thread ID. We can't use pthread_self here, |
1175 | | // because that function returns a totally opaque ID, which can't be |
1176 | | // compared via normal means. |
1177 | 10.8G | if (thread_local_id == 0) { |
1178 | 2.00M | thread_local_id = Barrier_AtomicIncrement(&cur_thread_local_id_, 1); |
1179 | 2.00M | } |
1180 | 10.8G | return thread_local_id; |
1181 | 10.8G | } |
1182 | | |
1183 | 29.7k | uint64_t NowMicros() override { |
1184 | 29.7k | struct timeval tv; |
1185 | 29.7k | gettimeofday(&tv, nullptr); |
1186 | 29.7k | return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; |
1187 | 29.7k | } |
1188 | | |
1189 | 212k | uint64_t NowNanos() override { |
1190 | | #if defined(__linux__) || defined(OS_FREEBSD) |
1191 | | struct timespec ts; |
1192 | | clock_gettime(CLOCK_MONOTONIC, &ts); |
1193 | | return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec; |
1194 | | #elif defined(__MACH__) |
1195 | 212k | clock_serv_t cclock; |
1196 | 212k | mach_timespec_t ts; |
1197 | 212k | host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); |
1198 | 212k | clock_get_time(cclock, &ts); |
1199 | 212k | mach_port_deallocate(mach_task_self(), cclock); |
1200 | 212k | return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec; |
1201 | | #else |
1202 | | return std::chrono::duration_cast<std::chrono::nanoseconds>( |
1203 | | std::chrono::steady_clock::now().time_since_epoch()).count(); |
1204 | | #endif |
1205 | 212k | } |
1206 | | |
1207 | 0 | void SleepForMicroseconds(int micros) override { |
1208 | 0 | ThreadRestrictions::AssertWaitAllowed(); |
1209 | 0 | SleepFor(MonoDelta::FromMicroseconds(micros)); |
1210 | 0 | } |
1211 | | |
1212 | 47.0k | Status GetExecutablePath(string* path) override { |
1213 | 47.0k | uint32_t size = 64; |
1214 | 47.0k | size_t len = 0; |
1215 | 94.1k | while (true) { |
1216 | 94.1k | std::unique_ptr<char[]> buf(new char[size]); |
1217 | | #if defined(__linux__) |
1218 | | auto rc = readlink("/proc/self/exe", buf.get(), size); |
1219 | | if (rc == -1) { |
1220 | | return STATUS(IOError, "Unable to determine own executable path", "", Errno(errno)); |
1221 | | } else if (rc >= size) { |
1222 | | // The buffer wasn't large enough |
1223 | | size *= 2; |
1224 | | continue; |
1225 | | } |
1226 | | len = rc; |
1227 | | #elif defined(__APPLE__) |
1228 | 94.1k | if (_NSGetExecutablePath(buf.get(), &size) != 0) { |
1229 | | // The buffer wasn't large enough; 'size' has been updated. |
1230 | 47.0k | continue; |
1231 | 47.0k | } |
1232 | 47.0k | len = strlen(buf.get()); |
1233 | | #else |
1234 | | #error Unsupported platform |
1235 | | #endif |
1236 | | |
1237 | 47.0k | path->assign(buf.get(), len); |
1238 | 47.0k | break; |
1239 | 94.1k | } |
1240 | 47.0k | return Status::OK(); |
1241 | 47.0k | } |
1242 | | |
1243 | 236k | Status IsDirectory(const string& path, bool* is_dir) override { |
1244 | 236k | TRACE_EVENT1("io", "PosixEnv::IsDirectory", "path", path); |
1245 | 236k | ThreadRestrictions::AssertIOAllowed(); |
1246 | 236k | Status s; |
1247 | 236k | struct stat sbuf; |
1248 | 236k | if (stat(path.c_str(), &sbuf) != 0) { |
1249 | 71.4k | s = STATUS_IO_ERROR(path, errno); |
1250 | 164k | } else { |
1251 | 164k | *is_dir = S_ISDIR(sbuf.st_mode); |
1252 | 164k | } |
1253 | 236k | return s; |
1254 | 236k | } |
1255 | | |
1256 | 8.03k | Result<bool> IsExecutableFile(const std::string& path) override { |
1257 | 8.03k | TRACE_EVENT1("io", "PosixEnv::IsExecutableFile", "path", path); |
1258 | 8.03k | ThreadRestrictions::AssertIOAllowed(); |
1259 | 8.03k | Status s; |
1260 | 8.03k | struct stat sbuf; |
1261 | 8.03k | if (stat(path.c_str(), &sbuf) != 0) { |
1262 | 0 | if (errno == ENOENT) { |
1263 | | // If the file does not exist, we just return false. |
1264 | 0 | return false; |
1265 | 0 | } |
1266 | 0 | return STATUS_IO_ERROR(path, errno); |
1267 | 0 | } |
1268 | | |
1269 | 8.03k | return !S_ISDIR(sbuf.st_mode) && (sbuf.st_mode & S_IXUSR); |
1270 | 8.03k | } |
1271 | | |
1272 | 155k | Status Walk(const string& root, DirectoryOrder order, const WalkCallback& cb) override { |
1273 | 155k | TRACE_EVENT1("io", "PosixEnv::Walk", "path", root); |
1274 | 155k | ThreadRestrictions::AssertIOAllowed(); |
1275 | | // Some sanity checks |
1276 | 155k | CHECK_NE(root, "/"); |
1277 | 155k | CHECK_NE(root, "./"); |
1278 | 155k | CHECK_NE(root, "."); |
1279 | 155k | CHECK_NE(root, ""); |
1280 | | |
1281 | | // FTS requires a non-const copy of the name. strdup it and free() when |
1282 | | // we leave scope. |
1283 | 155k | std::unique_ptr<char, FreeDeleter> name_dup(strdup(root.c_str())); |
1284 | 155k | char *paths[] = { name_dup.get(), nullptr }; |
1285 | | |
1286 | | // FTS_NOCHDIR is important here to make this thread-safe. |
1287 | 155k | std::unique_ptr<FTS, FtsCloser> tree( |
1288 | 155k | fts_open(paths, FTS_PHYSICAL | FTS_XDEV | FTS_NOCHDIR, nullptr)); |
1289 | 155k | if (!tree.get()) { |
1290 | 0 | return STATUS_IO_ERROR(root, errno); |
1291 | 0 | } |
1292 | | |
1293 | 155k | FTSENT *ent = nullptr; |
1294 | 155k | bool had_errors = false; |
1295 | 835k | while ((ent = fts_read(tree.get())) != nullptr) { |
1296 | 679k | bool doCb = false; |
1297 | 679k | FileType type = DIRECTORY_TYPE; |
1298 | 679k | switch (ent->fts_info) { |
1299 | 204k | case FTS_D: // Directory in pre-order |
1300 | 204k | if (order == PRE_ORDER) { |
1301 | 5 | doCb = true; |
1302 | 5 | } |
1303 | 204k | break; |
1304 | 204k | case FTS_DP: // Directory in post-order |
1305 | 204k | if (order == POST_ORDER) { |
1306 | 204k | doCb = true; |
1307 | 204k | } |
1308 | 204k | break; |
1309 | 269k | case FTS_F: // A regular file |
1310 | 269k | case FTS_SL: // A symbolic link |
1311 | 269k | case FTS_SLNONE: // A broken symbolic link |
1312 | 269k | case FTS_DEFAULT: // Unknown type of file |
1313 | 269k | doCb = true; |
1314 | 269k | type = FILE_TYPE; |
1315 | 269k | break; |
1316 | | |
1317 | 0 | case FTS_ERR: |
1318 | 0 | LOG(WARNING) << "Unable to access file " << ent->fts_path |
1319 | 0 | << " during walk: " << strerror(ent->fts_errno); |
1320 | 0 | had_errors = true; |
1321 | 0 | break; |
1322 | | |
1323 | 766 | default: |
1324 | 766 | LOG(WARNING) << "Unable to access file " << ent->fts_path |
1325 | 766 | << " during walk (code " << ent->fts_info << ")"; |
1326 | 766 | break; |
1327 | 679k | } |
1328 | 679k | if (doCb) { |
1329 | 474k | if (!cb(type, DirName(ent->fts_path), ent->fts_name).ok()) { |
1330 | 3 | had_errors = true; |
1331 | 3 | } |
1332 | 474k | } |
1333 | 679k | } |
1334 | | |
1335 | 155k | if (had_errors) { |
1336 | 2 | return STATUS(IOError, root, "One or more errors occurred"); |
1337 | 2 | } |
1338 | 155k | return Status::OK(); |
1339 | 155k | } |
1340 | | |
1341 | 30.5k | Status Canonicalize(const string& path, string* result) override { |
1342 | 30.5k | TRACE_EVENT1("io", "PosixEnv::Canonicalize", "path", path); |
1343 | 30.5k | ThreadRestrictions::AssertIOAllowed(); |
1344 | 30.5k | std::unique_ptr<char[], FreeDeleter> r(realpath(path.c_str(), nullptr)); |
1345 | 30.5k | if (!r) { |
1346 | 1 | return STATUS_IO_ERROR(path, errno); |
1347 | 1 | } |
1348 | 30.5k | *result = string(r.get()); |
1349 | 30.5k | return Status::OK(); |
1350 | 30.5k | } |
1351 | | |
1352 | 128 | Status GetTotalRAMBytes(int64_t* ram) override { |
1353 | 128 | #if defined(__APPLE__) |
1354 | 128 | int mib[2]; |
1355 | 128 | size_t length = sizeof(*ram); |
1356 | | |
1357 | | // Get the Physical memory size |
1358 | 128 | mib[0] = CTL_HW; |
1359 | 128 | mib[1] = HW_MEMSIZE; |
1360 | 128 | CHECK_ERR(sysctl(mib, 2, ram, &length, nullptr, 0)) << "sysctl CTL_HW HW_MEMSIZE failed"0 ; |
1361 | | #else |
1362 | | struct sysinfo info; |
1363 | | if (sysinfo(&info) < 0) { |
1364 | | return STATUS_IO_ERROR("sysinfo() failed", errno); |
1365 | | } |
1366 | | *ram = info.totalram; |
1367 | | #endif |
1368 | 128 | return Status::OK(); |
1369 | 128 | } |
1370 | | |
1371 | 9.27k | FileFactory* GetFileFactory() { |
1372 | 9.27k | return file_factory_.get(); |
1373 | 9.27k | } |
1374 | | |
1375 | 10 | Result<uint64_t> GetFreeSpaceBytes(const std::string& path) override { |
1376 | 10 | if (PREDICT_FALSE(FLAGS_TEST_simulate_free_space_bytes >= 0)) { |
1377 | 0 | return FLAGS_TEST_simulate_free_space_bytes; |
1378 | 0 | } |
1379 | | |
1380 | 10 | auto stat = GetFilesystemStats(path); |
1381 | 10 | RETURN_NOT_OK(stat); |
1382 | 10 | uint64_t block_size = stat->f_frsize > 0 ? static_cast<uint64_t>(stat->f_frsize) : |
1383 | 10 | static_cast<uint64_t>(stat->f_bsize)0 ; |
1384 | 10 | uint64_t available_blocks = static_cast<uint64_t>(stat->f_bavail); |
1385 | | |
1386 | 10 | return available_blocks * block_size; |
1387 | 10 | } |
1388 | | |
1389 | 1.08M | Result<FilesystemStats> GetFilesystemStatsBytes(const std::string& path) override { |
1390 | 1.08M | auto stat = GetFilesystemStats(path); |
1391 | 1.08M | RETURN_NOT_OK(stat); |
1392 | 214k | uint64_t block_size = stat->f_frsize > 0 ? static_cast<uint64_t>(stat->f_frsize)214k : |
1393 | 214k | static_cast<uint64_t>(stat->f_bsize)1 ; |
1394 | 214k | uint64_t available_blocks = static_cast<uint64_t>(stat->f_bavail); |
1395 | 214k | uint64_t total_blocks = static_cast<uint64_t>(stat->f_blocks); |
1396 | | |
1397 | 214k | return FilesystemStats{available_blocks * block_size, |
1398 | 214k | (total_blocks - available_blocks) * block_size, |
1399 | 214k | total_blocks * block_size}; |
1400 | 1.08M | } |
1401 | | |
1402 | 279k | Result<ResourceLimits> GetUlimit(int resource) override { |
1403 | 279k | struct rlimit lim; |
1404 | 279k | if (getrlimit(resource, &lim) != 0) { |
1405 | 0 | return STATUS_IO_ERROR("getrlimit() failed", errno); |
1406 | 0 | } |
1407 | 279k | ResourceLimit soft(lim.rlim_cur); |
1408 | 279k | ResourceLimit hard(lim.rlim_max); |
1409 | 279k | ResourceLimits limits { soft, hard }; |
1410 | 279k | return limits; |
1411 | 279k | } |
1412 | | |
1413 | 0 | CHECKED_STATUS SetUlimit(int resource, ResourceLimit value) override { |
1414 | 0 | return SetUlimit(resource, value, strings::Substitute("resource no. $0", resource)); |
1415 | 0 | } |
1416 | | |
1417 | | CHECKED_STATUS SetUlimit( |
1418 | 29.3k | int resource, ResourceLimit value, const std::string& resource_name) override { |
1419 | | |
1420 | 29.3k | auto limits = VERIFY_RESULT(GetUlimit(resource)); |
1421 | 29.3k | if (limits.soft == value) { |
1422 | 14.6k | return Status::OK(); |
1423 | 14.6k | } |
1424 | 14.6k | if (limits.hard < value) { |
1425 | 0 | return STATUS_FORMAT( |
1426 | 0 | InvalidArgument, |
1427 | 0 | "Resource limit value $0 for resource $1 greater than hard limit $2", |
1428 | 0 | value, resource, limits.hard.ToString()); |
1429 | 0 | } |
1430 | 14.6k | struct rlimit lim; |
1431 | 14.6k | lim.rlim_cur = value.RawValue(); |
1432 | 14.6k | lim.rlim_max = limits.hard.RawValue(); |
1433 | 14.6k | LOG(INFO) |
1434 | 14.6k | << "Modifying limit for " << resource_name |
1435 | 14.6k | << " from " << limits.soft.ToString() |
1436 | 14.6k | << " to " << value.ToString(); |
1437 | 14.6k | if (setrlimit(resource, &lim) != 0) { |
1438 | 0 | return STATUS(RuntimeError, "Unable to set rlimit", Errno(errno)); |
1439 | 0 | } |
1440 | 14.6k | return Status::OK(); |
1441 | 14.6k | } |
1442 | | |
1443 | 0 | bool IsEncrypted() const override { |
1444 | 0 | return file_factory_->IsEncrypted(); |
1445 | 0 | } |
1446 | | |
1447 | | private: |
1448 | | // std::unique_ptr Deleter implementation for fts_close |
1449 | | struct FtsCloser { |
1450 | 155k | void operator()(FTS *fts) const { |
1451 | 155k | if (fts) { fts_close(fts); } |
1452 | 155k | } |
1453 | | }; |
1454 | | |
1455 | 474k | Status DeleteRecursivelyCb(FileType type, const string& dirname, const string& basename) { |
1456 | 474k | string full_path = JoinPathSegments(dirname, basename); |
1457 | 474k | Status s; |
1458 | 474k | switch (type) { |
1459 | 269k | case FILE_TYPE: |
1460 | 269k | s = DeleteFile(full_path); |
1461 | 269k | WARN_NOT_OK(s, "Could not delete file"); |
1462 | 269k | return s; |
1463 | 204k | case DIRECTORY_TYPE: |
1464 | 204k | s = DeleteDir(full_path); |
1465 | 204k | WARN_NOT_OK(s, "Could not delete directory"); |
1466 | 204k | return s; |
1467 | 0 | default: |
1468 | 0 | LOG(FATAL) << "Unknown file type: " << type; |
1469 | 0 | return Status::OK(); |
1470 | 474k | } |
1471 | 474k | } |
1472 | | |
1473 | | std::unique_ptr<FileFactory> file_factory_; |
1474 | | }; |
1475 | | |
1476 | | class PosixFileFactory : public FileFactory { |
1477 | | #if defined(__linux__) |
1478 | | static constexpr int kODirectFlags = O_DIRECT | O_NOATIME | O_SYNC; |
1479 | | #else |
1480 | | static constexpr int kODirectFlags = 0; |
1481 | | #endif |
1482 | | |
1483 | | public: |
  // Construction and destruction do no work in their bodies.
  PosixFileFactory() {}
  ~PosixFileFactory() {}
1486 | | |
1487 | | Status NewSequentialFile( |
1488 | 4.10k | const std::string& fname, std::unique_ptr<SequentialFile>* result) override { |
1489 | 4.10k | TRACE_EVENT1("io", "PosixEnv::NewSequentialFile", "path", fname); |
1490 | 4.10k | ThreadRestrictions::AssertIOAllowed(); |
1491 | 4.10k | FILE* f = fopen(fname.c_str(), "r"); |
1492 | 4.10k | if (f == nullptr) { |
1493 | 0 | return STATUS_IO_ERROR(fname, errno); |
1494 | 4.10k | } else { |
1495 | 4.10k | result->reset(new yb::PosixSequentialFile(fname, f, yb::FileSystemOptions::kDefault)); |
1496 | 4.10k | return Status::OK(); |
1497 | 4.10k | } |
1498 | 4.10k | } |
1499 | | |
1500 | | Status NewRandomAccessFile(const std::string& fname, |
1501 | 874k | std::unique_ptr<RandomAccessFile>* result) override { |
1502 | 874k | TRACE_EVENT1("io", "PosixEnv::NewRandomAccessFile", "path", fname); |
1503 | 874k | ThreadRestrictions::AssertIOAllowed(); |
1504 | 874k | int fd = open(fname.c_str(), O_RDONLY); |
1505 | 874k | if (fd < 0) { |
1506 | 26.3k | return STATUS_IO_ERROR(fname, errno); |
1507 | 26.3k | } |
1508 | | |
1509 | 847k | result->reset(new yb::PosixRandomAccessFile(fname, fd, yb::FileSystemOptions::kDefault)); |
1510 | 847k | return Status::OK(); |
1511 | 874k | } |
1512 | | |
1513 | | Status NewWritableFile(const std::string& fname, |
1514 | 2.33k | std::unique_ptr<WritableFile>* result) override { |
1515 | 2.33k | return NewWritableFile(WritableFileOptions(), fname, result); |
1516 | 2.33k | } |
1517 | | |
1518 | | Status NewWritableFile(const WritableFileOptions& opts, |
1519 | | const std::string& fname, |
1520 | 13.7k | std::unique_ptr<WritableFile>* result) override { |
1521 | 13.7k | TRACE_EVENT1("io", "PosixEnv::NewWritableFile", "path", fname); |
1522 | 13.7k | int fd = -1; |
1523 | 13.7k | int extra_flags = 0; |
1524 | 13.7k | if (UseODirect(opts.o_direct)) { |
1525 | 0 | extra_flags = kODirectFlags; |
1526 | 0 | } |
1527 | 13.7k | RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd, extra_flags)); |
1528 | 13.7k | return InstantiateNewWritableFile(fname, fd, opts, result); |
1529 | 13.7k | } |
1530 | | |
1531 | | Status NewTempWritableFile(const WritableFileOptions& opts, |
1532 | | const std::string& name_template, |
1533 | | std::string* created_filename, |
1534 | 2.31M | std::unique_ptr<WritableFile>* result) override { |
1535 | 2.31M | TRACE_EVENT1("io", "PosixEnv::NewTempWritableFile", "template", name_template); |
1536 | 2.31M | ThreadRestrictions::AssertIOAllowed(); |
1537 | 2.31M | std::unique_ptr<char[]> fname(new char[name_template.size() + 1]); |
1538 | 2.31M | ::snprintf(fname.get(), name_template.size() + 1, "%s", name_template.c_str()); |
1539 | 2.31M | int fd = -1; |
1540 | 2.31M | if (UseODirect(opts.o_direct)) { |
1541 | 0 | fd = ::mkostemp(fname.get(), kODirectFlags); |
1542 | 2.31M | } else { |
1543 | 2.31M | fd = ::mkstemp(fname.get()); |
1544 | 2.31M | } |
1545 | 2.31M | if (fd < 0) { |
1546 | 15 | return STATUS_IO_ERROR(Format("Call to mkstemp() failed on name template $0", name_template), |
1547 | 15 | errno); |
1548 | 15 | } |
1549 | 2.31M | *created_filename = fname.get(); |
1550 | 2.31M | return InstantiateNewWritableFile(*created_filename, fd, opts, result); |
1551 | 2.31M | } |
1552 | | |
1553 | 1 | Status NewRWFile(const string& fname, std::unique_ptr<RWFile>* result) override { |
1554 | 1 | return NewRWFile(RWFileOptions(), fname, result); |
1555 | 1 | } |
1556 | | |
1557 | | Status NewRWFile(const RWFileOptions& opts, const string& fname, |
1558 | 6 | std::unique_ptr<RWFile>* result) override { |
1559 | 6 | TRACE_EVENT1("io", "PosixEnv::NewRWFile", "path", fname); |
1560 | 6 | int fd = -1; |
1561 | 6 | RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd)); |
1562 | 5 | result->reset(new PosixRWFile(fname, fd, opts.sync_on_close)); |
1563 | 5 | return Status::OK(); |
1564 | 6 | } |
1565 | | |
1566 | 834k | Result<uint64_t> GetFileSize(const std::string& fname) override { |
1567 | 834k | return GetFileStat( |
1568 | 834k | fname, "PosixEnv::GetFileSize", [](const struct stat& sbuf) { return sbuf.st_size; }834k ); |
1569 | 834k | } |
1570 | | |
1571 | 0 | bool IsEncrypted() const override { |
1572 | 0 | return false; |
1573 | 0 | } |
1574 | | |
1575 | | private: |
1576 | 2.32M | bool UseODirect(bool o_direct) { |
1577 | | #if defined(__linux__) |
1578 | | return o_direct; |
1579 | | #else |
1580 | 2.32M | return false; |
1581 | 2.32M | #endif |
1582 | 2.32M | } |
1583 | | |
1584 | | Status InstantiateNewWritableFile(const std::string& fname, |
1585 | | int fd, |
1586 | | const WritableFileOptions& opts, |
1587 | 2.32M | std::unique_ptr<WritableFile>* result) { |
1588 | 2.32M | uint64_t file_size = 0; |
1589 | 2.32M | if (opts.mode == PosixEnv::OPEN_EXISTING) { |
1590 | 1 | auto lseek_result = lseek(fd, 0, SEEK_END); |
1591 | 1 | if (lseek_result < 0) { |
1592 | 0 | return STATUS_IO_ERROR(fname, errno); |
1593 | 0 | } |
1594 | 1 | file_size = lseek_result; |
1595 | 1 | } |
1596 | | #if defined(__linux__) |
1597 | | if (opts.o_direct) |
1598 | | *result = std::make_unique<PosixDirectIOWritableFile>( |
1599 | | fname, fd, file_size, opts.sync_on_close); |
1600 | | else |
1601 | | #endif |
1602 | 2.32M | *result = std::make_unique<PosixWritableFile>(fname, fd, file_size, opts.sync_on_close); |
1603 | 2.32M | return Status::OK(); |
1604 | 2.32M | } |
1605 | | }; |
1606 | | |
1607 | 26.8k | PosixEnv::PosixEnv() : file_factory_(std::make_unique<PosixFileFactory>()) {} |
1608 | | PosixEnv::PosixEnv(std::unique_ptr<FileFactory> file_factory) : |
1609 | 9.27k | file_factory_(std::move(file_factory)) {} |
1610 | | |
1611 | | } // namespace |
1612 | | |
1613 | | static pthread_once_t once = PTHREAD_ONCE_INIT; |
1614 | | static Env* default_env; |
1615 | 26.8k | static void InitDefaultEnv() { default_env = new PosixEnv; } |
1616 | | |
1617 | 10.8G | Env* Env::Default() { |
1618 | 10.8G | pthread_once(&once, InitDefaultEnv); |
1619 | 10.8G | return default_env; |
1620 | 10.8G | } |
1621 | | |
1622 | 9.27k | FileFactory* Env::DefaultFileFactory() { |
1623 | 9.27k | return down_cast<PosixEnv*>(Env::Default())->GetFileFactory(); |
1624 | 9.27k | } |
1625 | | |
1626 | 9.27k | std::unique_ptr<Env> Env::NewDefaultEnv(std::unique_ptr<FileFactory> file_factory) { |
1627 | 9.27k | return std::make_unique<PosixEnv>(std::move(file_factory)); |
1628 | 9.27k | } |
1629 | | |
1630 | | |
1631 | | } // namespace yb |