filesystem-disk-unix.c++ (60907B)
1 // Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 #if !_WIN32 23 24 #ifndef _GNU_SOURCE 25 #define _GNU_SOURCE 26 #endif 27 28 #include "filesystem.h" 29 #include "debug.h" 30 #include <sys/types.h> 31 #include <sys/stat.h> 32 #include <sys/ioctl.h> 33 #include <fcntl.h> 34 #include <unistd.h> 35 #include <stdio.h> 36 #include <sys/mman.h> 37 #include <errno.h> 38 #include <dirent.h> 39 #include <stdlib.h> 40 #include "vector.h" 41 #include "miniposix.h" 42 #include <algorithm> 43 44 #if __linux__ 45 #include <syscall.h> 46 #include <linux/fs.h> 47 #include <sys/sendfile.h> 48 #endif 49 50 namespace kj { 51 namespace { 52 53 #define HIDDEN_PREFIX ".kj-tmp." 54 // Prefix for temp files which should be hidden when listing a directory. 55 // 56 // If you change this, make sure to update the unit test. 57 58 #ifdef O_CLOEXEC 59 #define MAYBE_O_CLOEXEC O_CLOEXEC 60 #else 61 #define MAYBE_O_CLOEXEC 0 62 #endif 63 64 #ifdef O_DIRECTORY 65 #define MAYBE_O_DIRECTORY O_DIRECTORY 66 #else 67 #define MAYBE_O_DIRECTORY 0 68 #endif 69 70 #if __APPLE__ 71 // Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.) 72 #undef SEEK_HOLE 73 #endif 74 75 #if __BIONIC__ 76 // No no DTTOIF function 77 #undef DT_UNKNOWN 78 #endif 79 80 static void setCloexec(int fd) KJ_UNUSED; 81 static void setCloexec(int fd) { 82 // Set the O_CLOEXEC flag on the given fd. 83 // 84 // We try to avoid the need to call this by taking advantage of syscall flags that set it 85 // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls. 86 87 #ifdef FIOCLEX 88 // Yay, we can set the flag in one call. 89 KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) { 90 case EINVAL: 91 case EOPNOTSUPP: 92 break; 93 default: 94 KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; } 95 break; 96 } else { 97 // success 98 return; 99 } 100 #endif 101 102 // Sadness, we must resort to read/modify/write. 103 // 104 // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could 105 // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway 106 // most platforms support FIOCLEX.) 107 int flags; 108 KJ_SYSCALL(flags = fcntl(fd, F_GETFD)); 109 if (!(flags & FD_CLOEXEC)) { 110 KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)); 111 } 112 } 113 114 static Date toKjDate(struct timespec tv) { 115 return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH; 116 } 117 118 static FsNode::Type modeToType(mode_t mode) { 119 switch (mode & S_IFMT) { 120 case S_IFREG : return FsNode::Type::FILE; 121 case S_IFDIR : return FsNode::Type::DIRECTORY; 122 case S_IFLNK : return FsNode::Type::SYMLINK; 123 case S_IFBLK : return FsNode::Type::BLOCK_DEVICE; 124 case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE; 125 case S_IFIFO : return FsNode::Type::NAMED_PIPE; 126 case S_IFSOCK: return FsNode::Type::SOCKET; 127 default: return FsNode::Type::OTHER; 128 } 129 } 130 131 static FsNode::Metadata statToMetadata(struct stat& stats) { 132 // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits 133 // and XOR. 134 uint64_t d = stats.st_dev; 135 uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino; 136 137 return FsNode::Metadata { 138 modeToType(stats.st_mode), 139 implicitCast<uint64_t>(stats.st_size), 140 implicitCast<uint64_t>(stats.st_blocks * 512u), 141 #if __APPLE__ 142 toKjDate(stats.st_mtimespec), 143 #else 144 toKjDate(stats.st_mtim), 145 #endif 146 implicitCast<uint>(stats.st_nlink), 147 hash 148 }; 149 } 150 151 static bool rmrf(int fd, StringPtr path); 152 153 static void rmrfChildrenAndClose(int fd) { 154 // Assumes fd is seeked to beginning. 155 156 DIR* dir = fdopendir(fd); 157 if (dir == nullptr) { 158 close(fd); 159 KJ_FAIL_SYSCALL("fdopendir", errno); 160 }; 161 KJ_DEFER(closedir(dir)); 162 163 for (;;) { 164 errno = 0; 165 struct dirent* entry = readdir(dir); 166 if (entry == nullptr) { 167 int error = errno; 168 if (error == 0) { 169 break; 170 } else { 171 KJ_FAIL_SYSCALL("readdir", error); 172 } 173 } 174 175 if (entry->d_name[0] == '.' && 176 (entry->d_name[1] == '\0' || 177 (entry->d_name[1] == '.' && 178 entry->d_name[2] == '\0'))) { 179 // ignore . and .. 180 } else { 181 #ifdef DT_UNKNOWN // d_type is not available on all platforms. 182 if (entry->d_type == DT_DIR) { 183 int subdirFd; 184 KJ_SYSCALL(subdirFd = openat( 185 fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)); 186 rmrfChildrenAndClose(subdirFd); 187 KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR)); 188 } else if (entry->d_type != DT_UNKNOWN) { 189 KJ_SYSCALL(unlinkat(fd, entry->d_name, 0)); 190 } else { 191 #endif 192 KJ_ASSERT(rmrf(fd, entry->d_name)); 193 #ifdef DT_UNKNOWN 194 } 195 #endif 196 } 197 } 198 } 199 200 static bool rmrf(int fd, StringPtr path) { 201 struct stat stats; 202 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { 203 case ENOENT: 204 case ENOTDIR: 205 // Doesn't exist. 206 return false; 207 default: 208 KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; } 209 } 210 211 if (S_ISDIR(stats.st_mode)) { 212 int subdirFd; 213 KJ_SYSCALL(subdirFd = openat( 214 fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; } 215 rmrfChildrenAndClose(subdirFd); 216 KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; } 217 } else { 218 KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; } 219 } 220 221 return true; 222 } 223 224 struct MmapRange { 225 uint64_t offset; 226 uint64_t size; 227 }; 228 229 static MmapRange getMmapRange(uint64_t offset, uint64_t size) { 230 // Comes up with an offset and size to pass to mmap(), given an offset and size requested by 231 // the caller, and considering the fact that mappings must start at a page boundary. 232 // 233 // The offset is rounded down to the nearest page boundary, and the size is increased to 234 // compensate. Note that the endpoint of the mapping is *not* rounded up to a page boundary, as 235 // mmap() does not actually require this, and it causes trouble on some systems (notably Cygwin). 236 237 #ifndef _SC_PAGESIZE 238 #define _SC_PAGESIZE _SC_PAGE_SIZE 239 #endif 240 static const uint64_t pageSize = sysconf(_SC_PAGESIZE); 241 uint64_t pageMask = pageSize - 1; 242 243 uint64_t realOffset = offset & ~pageMask; 244 245 return { realOffset, offset + size - realOffset }; 246 } 247 248 class MmapDisposer: public ArrayDisposer { 249 protected: 250 void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount, 251 size_t capacity, void (*destroyElement)(void*)) const { 252 auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement), 253 elementSize * elementCount); 254 KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; } 255 } 256 }; 257 258 constexpr MmapDisposer mmapDisposer = MmapDisposer(); 259 260 class DiskHandle { 261 // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and 262 // Directory for disk handles. There is a lot of implementation overlap between these, especially 263 // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements 264 // these because then we get diamond inheritance which means we need to make all our inheritance 265 // virtual which means downcasting requires RTTI which violates our goal of supporting compiling 266 // with no RTTI. So instead we have the DiskHandle class which implements all the methods without 267 // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to 268 // it. Ugly, but works. 269 270 public: 271 DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {} 272 273 // OsHandle ------------------------------------------------------------------ 274 275 AutoCloseFd clone() const { 276 int fd2; 277 #ifdef F_DUPFD_CLOEXEC 278 KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) { 279 case EINVAL: 280 case EOPNOTSUPP: 281 // fall back 282 break; 283 default: 284 KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; } 285 break; 286 } else { 287 return AutoCloseFd(fd2); 288 } 289 #endif 290 291 KJ_SYSCALL(fd2 = ::dup(fd)); 292 AutoCloseFd result(fd2); 293 setCloexec(result); 294 return result; 295 } 296 297 int getFd() const { 298 return fd.get(); 299 } 300 301 // FsNode -------------------------------------------------------------------- 302 303 FsNode::Metadata stat() const { 304 struct stat stats; 305 KJ_SYSCALL(::fstat(fd, &stats)); 306 return statToMetadata(stats); 307 } 308 309 void sync() const { 310 #if __APPLE__ 311 // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware 312 // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does 313 // the right thing. Why they don't just make fsync() do the right thing, I do not know. 314 KJ_SYSCALL(fcntl(fd, F_FULLFSYNC)); 315 #else 316 KJ_SYSCALL(fsync(fd)); 317 #endif 318 } 319 320 void datasync() const { 321 // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync() 322 // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple. 323 #if _POSIX_SYNCHRONIZED_IO && !__APPLE__ 324 KJ_SYSCALL(fdatasync(fd)); 325 #else 326 this->sync(); 327 #endif 328 } 329 330 // ReadableFile -------------------------------------------------------------- 331 332 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const { 333 // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per 334 // spec we are not allowed to assume this. 335 336 size_t total = 0; 337 while (buffer.size() > 0) { 338 ssize_t n; 339 KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset)); 340 if (n == 0) break; 341 total += n; 342 offset += n; 343 buffer = buffer.slice(n, buffer.size()); 344 } 345 return total; 346 } 347 348 Array<const byte> mmap(uint64_t offset, uint64_t size) const { 349 if (size == 0) return nullptr; // zero-length mmap() returns EINVAL, so avoid it 350 auto range = getMmapRange(offset, size); 351 const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset); 352 if (mapping == MAP_FAILED) { 353 KJ_FAIL_SYSCALL("mmap", errno); 354 } 355 return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset), 356 size, mmapDisposer); 357 } 358 359 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const { 360 if (size == 0) return nullptr; // zero-length mmap() returns EINVAL, so avoid it 361 auto range = getMmapRange(offset, size); 362 void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset); 363 if (mapping == MAP_FAILED) { 364 KJ_FAIL_SYSCALL("mmap", errno); 365 } 366 return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), 367 size, mmapDisposer); 368 } 369 370 // File ---------------------------------------------------------------------- 371 372 void write(uint64_t offset, ArrayPtr<const byte> data) const { 373 // pwrite() probably never returns short writes unless there's no space left on disk. 374 // Unfortunately, though, per spec we are not allowed to assume this. 375 376 while (data.size() > 0) { 377 ssize_t n; 378 KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset)); 379 KJ_ASSERT(n > 0, "pwrite() returned zero?"); 380 offset += n; 381 data = data.slice(n, data.size()); 382 } 383 } 384 385 void zero(uint64_t offset, uint64_t size) const { 386 // If FALLOC_FL_PUNCH_HOLE is defined, use it to efficiently zero the area. 387 // 388 // A fallocate() wrapper was only added to Android's Bionic C library as of API level 21, 389 // but FALLOC_FL_PUNCH_HOLE is apparently defined in the headers before that, so we'll 390 // have to explicitly test for that case. 391 #if defined(FALLOC_FL_PUNCH_HOLE) && !(__ANDROID__ && __BIONIC__ && __ANDROID_API__ < 21) 392 KJ_SYSCALL_HANDLE_ERRORS( 393 fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) { 394 case EOPNOTSUPP: 395 // fall back to below 396 break; 397 default: 398 KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; } 399 } else { 400 return; 401 } 402 #endif 403 404 static const byte ZEROS[4096] = { 0 }; 405 406 #if __APPLE__ || __CYGWIN__ || (defined(__ANDROID__) && __ANDROID_API__ < 24) 407 // Mac & Cygwin & Android API levels 23 and lower doesn't have pwritev(). 408 while (size > sizeof(ZEROS)) { 409 write(offset, ZEROS); 410 size -= sizeof(ZEROS); 411 offset += sizeof(ZEROS); 412 } 413 write(offset, kj::arrayPtr(ZEROS, size)); 414 #else 415 // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible. 416 size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS); 417 const size_t iovmax = miniposix::iovMax(); 418 KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256); 419 420 for (auto& item: iov) { 421 item.iov_base = const_cast<byte*>(ZEROS); 422 item.iov_len = sizeof(ZEROS); 423 } 424 425 while (size > 0) { 426 size_t iovCount; 427 if (size >= iov.size() * sizeof(ZEROS)) { 428 iovCount = iov.size(); 429 } else { 430 iovCount = size / sizeof(ZEROS); 431 size_t rem = size % sizeof(ZEROS); 432 if (rem > 0) { 433 iov[iovCount++].iov_len = rem; 434 } 435 } 436 437 ssize_t n; 438 KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset)); 439 KJ_ASSERT(n > 0, "pwrite() returned zero?"); 440 441 offset += n; 442 size -= n; 443 } 444 #endif 445 } 446 447 void truncate(uint64_t size) const { 448 KJ_SYSCALL(ftruncate(fd, size)); 449 } 450 451 class WritableFileMappingImpl final: public WritableFileMapping { 452 public: 453 WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {} 454 455 ArrayPtr<byte> get() const override { 456 // const_cast OK because WritableFileMapping does indeed provide a writable view despite 457 // being const itself. 458 return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size()); 459 } 460 461 void changed(ArrayPtr<byte> slice) const override { 462 KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), 463 "byte range is not part of this mapping"); 464 if (slice.size() == 0) return; 465 466 // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. 467 auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); 468 KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC)); 469 } 470 471 void sync(ArrayPtr<byte> slice) const override { 472 KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), 473 "byte range is not part of this mapping"); 474 if (slice.size() == 0) return; 475 476 // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. 477 auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); 478 KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC)); 479 } 480 481 private: 482 Array<byte> bytes; 483 }; 484 485 Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const { 486 if (size == 0) { 487 // zero-length mmap() returns EINVAL, so avoid it 488 return heap<WritableFileMappingImpl>(nullptr); 489 } 490 auto range = getMmapRange(offset, size); 491 void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset); 492 if (mapping == MAP_FAILED) { 493 KJ_FAIL_SYSCALL("mmap", errno); 494 } 495 auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), 496 size, mmapDisposer); 497 return heap<WritableFileMappingImpl>(kj::mv(array)); 498 } 499 500 size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const { 501 // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for 502 // the OS. Only returns less than `size` if EOF. Does not account for holes. 503 504 #if __linux__ 505 { 506 KJ_SYSCALL(lseek(fd, offset, SEEK_SET)); 507 off_t fromPos = fromOffset; 508 off_t end = fromOffset + size; 509 while (fromPos < end) { 510 ssize_t n; 511 KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) { 512 case EINVAL: 513 case ENOSYS: 514 goto sendfileNotAvailable; 515 default: 516 KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; } 517 } 518 if (n == 0) break; 519 } 520 return fromPos - fromOffset; 521 } 522 523 sendfileNotAvailable: 524 #endif 525 uint64_t total = 0; 526 while (size > 0) { 527 byte buffer[4096]; 528 ssize_t n; 529 KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset)); 530 if (n == 0) break; 531 write(offset, arrayPtr(buffer, n)); 532 fromOffset += n; 533 offset += n; 534 total += n; 535 size -= n; 536 } 537 return total; 538 } 539 540 kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from, 541 uint64_t fromOffset, uint64_t size) const { 542 KJ_IF_MAYBE(otherFd, from.getFd()) { 543 #ifdef FICLONE 544 if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) { 545 if (ioctl(fd, FICLONE, *otherFd) >= 0) { 546 return stat().size; 547 } 548 } else if (size > 0) { // src_length = 0 has special meaning for the syscall, so avoid. 549 struct file_clone_range range; 550 memset(&range, 0, sizeof(range)); 551 range.src_fd = *otherFd; 552 range.dest_offset = offset; 553 range.src_offset = fromOffset; 554 range.src_length = size == kj::maxValue ? 0 : size; 555 if (ioctl(fd, FICLONERANGE, &range) >= 0) { 556 // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs 557 // don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the 558 // rest. Where does the destination file's EOF marker end up? Who knows? 559 return kj::min(from.stat().size - fromOffset, size); 560 } 561 } else { 562 // size == 0 563 return size_t(0); 564 } 565 566 // ioctl failed. Almost all failures documented for these are of the form "the operation is 567 // not supported for the filesystem(s) specified", so fall back to other approaches. 568 #endif 569 570 off_t toPos = offset; 571 off_t fromPos = fromOffset; 572 off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size); 573 574 for (;;) { 575 // Handle data. 576 { 577 // Find out how much data there is before the next hole. 578 off_t nextHole; 579 #ifdef SEEK_HOLE 580 KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) { 581 case EINVAL: 582 // SEEK_HOLE probably not supported. Assume no holes. 583 nextHole = end; 584 break; 585 case ENXIO: 586 // Past EOF. Stop here. 587 return fromPos - fromOffset; 588 default: 589 KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } 590 } 591 #else 592 // SEEK_HOLE not supported. Assume no holes. 593 nextHole = end; 594 #endif 595 596 // Copy the next chunk of data. 597 off_t copyTo = kj::min(end, nextHole); 598 size_t amount = copyTo - fromPos; 599 if (amount > 0) { 600 size_t n = copyChunk(toPos, *otherFd, fromPos, amount); 601 fromPos += n; 602 toPos += n; 603 604 if (n < amount) { 605 return fromPos - fromOffset; 606 } 607 } 608 609 if (fromPos == end) { 610 return fromPos - fromOffset; 611 } 612 } 613 614 #ifdef SEEK_HOLE 615 // Handle hole. 616 { 617 // Find out how much hole there is before the next data. 618 off_t nextData; 619 KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) { 620 case EINVAL: 621 // SEEK_DATA probably not supported. But we should only have gotten here if we 622 // were expecting a hole. 623 KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported"); 624 break; 625 case ENXIO: 626 // No more data. Set to EOF. 627 KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END)); 628 if (nextData > end) { 629 end = nextData; 630 } 631 break; 632 default: 633 KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } 634 } 635 636 // Write zeros. 637 off_t zeroTo = kj::min(end, nextData); 638 off_t amount = zeroTo - fromPos; 639 if (amount > 0) { 640 zero(toPos, amount); 641 toPos += amount; 642 fromPos = zeroTo; 643 } 644 645 if (fromPos == end) { 646 return fromPos - fromOffset; 647 } 648 } 649 #endif 650 } 651 } 652 653 // Indicates caller should call File::copy() default implementation. 654 return nullptr; 655 } 656 657 // ReadableDirectory --------------------------------------------------------- 658 659 template <typename Func> 660 auto list(bool needTypes, Func&& func) const 661 -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> { 662 // Seek to start of directory. 663 KJ_SYSCALL(lseek(fd, 0, SEEK_SET)); 664 665 // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to 666 // make a duplicate. 667 int duped; 668 KJ_SYSCALL(duped = dup(fd)); 669 DIR* dir = fdopendir(duped); 670 if (dir == nullptr) { 671 close(duped); 672 KJ_FAIL_SYSCALL("fdopendir", errno); 673 } 674 675 KJ_DEFER(closedir(dir)); 676 typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry; 677 kj::Vector<Entry> entries; 678 679 for (;;) { 680 errno = 0; 681 struct dirent* entry = readdir(dir); 682 if (entry == nullptr) { 683 int error = errno; 684 if (error == 0) { 685 break; 686 } else { 687 KJ_FAIL_SYSCALL("readdir", error); 688 } 689 } 690 691 kj::StringPtr name = entry->d_name; 692 if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) { 693 #ifdef DT_UNKNOWN // d_type is not available on all platforms. 694 if (entry->d_type != DT_UNKNOWN) { 695 entries.add(func(name, modeToType(DTTOIF(entry->d_type)))); 696 } else { 697 #endif 698 if (needTypes) { 699 // Unknown type. Fall back to stat. 700 struct stat stats; 701 KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW)); 702 entries.add(func(name, modeToType(stats.st_mode))); 703 } else { 704 entries.add(func(name, FsNode::Type::OTHER)); 705 } 706 #ifdef DT_UNKNOWN 707 } 708 #endif 709 } 710 } 711 712 auto result = entries.releaseAsArray(); 713 std::sort(result.begin(), result.end()); 714 return result; 715 } 716 717 Array<String> listNames() const { 718 return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); }); 719 } 720 721 Array<ReadableDirectory::Entry> listEntries() const { 722 return list(true, [](StringPtr name, FsNode::Type type) { 723 return ReadableDirectory::Entry { type, heapString(name), }; 724 }); 725 } 726 727 bool exists(PathPtr path) const { 728 KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) { 729 case ENOENT: 730 case ENOTDIR: 731 return false; 732 default: 733 KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; } 734 } 735 return true; 736 } 737 738 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const { 739 struct stat stats; 740 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { 741 case ENOENT: 742 case ENOTDIR: 743 return nullptr; 744 default: 745 KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; } 746 } 747 return statToMetadata(stats); 748 } 749 750 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const { 751 int newFd; 752 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( 753 fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) { 754 case ENOENT: 755 case ENOTDIR: 756 return nullptr; 757 default: 758 KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; } 759 } 760 761 kj::AutoCloseFd result(newFd); 762 #ifndef O_CLOEXEC 763 setCloexec(result); 764 #endif 765 766 return newDiskReadableFile(kj::mv(result)); 767 } 768 769 Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const { 770 int newFd; 771 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( 772 fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { 773 case ENOENT: 774 return nullptr; 775 case ENOTDIR: 776 // Could mean that a parent is not a directory, which we treat as "doesn't exist". 777 // Could also mean that the specified file is not a directory, which should throw. 778 // Check using exists(). 779 if (!exists(path)) { 780 return nullptr; 781 } 782 KJ_FALLTHROUGH; 783 default: 784 KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; } 785 } 786 787 kj::AutoCloseFd result(newFd); 788 #ifndef O_CLOEXEC 789 setCloexec(result); 790 #endif 791 792 return kj::mv(result); 793 } 794 795 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const { 796 return tryOpenSubdirInternal(path).map(newDiskReadableDirectory); 797 } 798 799 Maybe<String> tryReadlink(PathPtr path) const { 800 size_t trySize = 256; 801 for (;;) { 802 KJ_STACK_ARRAY(char, buf, trySize, 256, 4096); 803 ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size()); 804 if (n < 0) { 805 int error = errno; 806 switch (error) { 807 case EINTR: 808 continue; 809 case ENOENT: 810 case ENOTDIR: 811 case EINVAL: // not a link 812 return nullptr; 813 default: 814 KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; } 815 } 816 } 817 818 if (n >= buf.size()) { 819 // Didn't give it enough space. Better retry with a bigger buffer. 820 trySize *= 2; 821 continue; 822 } 823 824 return heapString(buf.begin(), n); 825 } 826 } 827 828 // Directory ----------------------------------------------------------------- 829 830 bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const { 831 // Internal function to make a directory. 832 833 auto filename = path.toString(); 834 mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; 835 836 KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) { 837 case EEXIST: { 838 // Apparently this path exists. 839 if (!has(mode, WriteMode::MODIFY)) { 840 // Require exclusive create. 841 return false; 842 } 843 844 // MODIFY is allowed, so we just need to check whether the existing entry is a directory. 845 struct stat stats; 846 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) { 847 default: 848 // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe 849 // we can't access it for some reason. Assume failure. 850 // 851 // TODO(someday): Maybe we should be creating the directory at the target of the 852 // link? 853 goto failed; 854 } 855 return (stats.st_mode & S_IFMT) == S_IFDIR; 856 } 857 case ENOENT: 858 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && 859 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | 860 WriteMode::CREATE_PARENT, true)) { 861 // Retry, but make sure we don't try to create the parent again. 862 return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow); 863 } else { 864 goto failed; 865 } 866 default: 867 failed: 868 if (noThrow) { 869 // Caller requested no throwing. 870 return false; 871 } else { 872 KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path); 873 } 874 } 875 876 return true; 877 } 878 879 kj::Maybe<String> createNamedTemporary( 880 PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const { 881 // Create a temporary file which will eventually replace `finalName`. 882 // 883 // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate() 884 // is expected to behave like a syscall, returning a negative value and setting `errno` on 885 // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in 886 // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will 887 // be called again with a new path. 888 // 889 // Returns the temporary path that succeeded. Only returns nullptr if there was an exception 890 // but we're compiled with -fno-exceptions. 891 892 if (finalName.size() == 0) { 893 KJ_FAIL_REQUIRE("can't replace self") { break; } 894 return nullptr; 895 } 896 897 static uint counter = 0; 898 static const pid_t pid = getpid(); 899 String pathPrefix; 900 if (finalName.size() > 1) { 901 pathPrefix = kj::str(finalName.parent(), '/'); 902 } 903 auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.', 904 finalName.basename()[0], ".partial"); 905 906 KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) { 907 case EEXIST: 908 return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); 909 case ENOENT: 910 if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 && 911 tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY | 912 WriteMode::CREATE_PARENT, true)) { 913 // Retry, but make sure we don't try to create the parent again. 914 mode = mode - WriteMode::CREATE_PARENT; 915 return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); 916 } 917 KJ_FALLTHROUGH; 918 default: 919 KJ_FAIL_SYSCALL("create(path)", error, path) { break; } 920 return nullptr; 921 } 922 923 return kj::mv(path); 924 } 925 926 bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const { 927 // Replaces the given path with an object created by calling tryCreate(). 928 // 929 // tryCreate() must behave like a syscall which creates the node at the path passed to it, 930 // returning a negative value on error. If the path passed to tryCreate already exists, it 931 // MUST fail with EEXIST. 932 // 933 // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a 934 // temporary location and then rename()ing it into place. 935 936 if (path.size() == 0) { 937 KJ_FAIL_REQUIRE("can't replace self") { return false; } 938 } 939 940 auto filename = path.toString(); 941 942 if (has(mode, WriteMode::CREATE)) { 943 // First try just cerating the node in-place. 944 KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) { 945 case EEXIST: 946 // Target exists. 947 if (has(mode, WriteMode::MODIFY)) { 948 // Fall back to MODIFY path, below. 949 break; 950 } else { 951 return false; 952 } 953 case ENOENT: 954 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && 955 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | 956 WriteMode::CREATE_PARENT, true)) { 957 // Retry, but make sure we don't try to create the parent again. 958 return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate)); 959 } 960 KJ_FALLTHROUGH; 961 default: 962 KJ_FAIL_SYSCALL("create(path)", error, path) { return false; } 963 } else { 964 // Success. 965 return true; 966 } 967 } 968 969 // Either we don't have CREATE mode or the target already exists. We need to perform a 970 // replacement instead. 971 972 KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) { 973 if (tryCommitReplacement(filename, fd, *tempPath, mode)) { 974 return true; 975 } else { 976 KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) { 977 case ENOENT: 978 // meh 979 break; 980 default: 981 KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath); 982 } 983 return false; 984 } 985 } else { 986 // threw, but exceptions are disabled 987 return false; 988 } 989 } 990 991 Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const { 992 uint flags = O_RDWR | MAYBE_O_CLOEXEC; 993 mode_t acl = 0666; 994 if (has(mode, WriteMode::CREATE)) { 995 flags |= O_CREAT; 996 } 997 if (!has(mode, WriteMode::MODIFY)) { 998 if (!has(mode, WriteMode::CREATE)) { 999 // Neither CREATE nor MODIFY -- impossible to satisfy preconditions. 1000 return nullptr; 1001 } 1002 flags |= O_EXCL; 1003 } 1004 if (append) { 1005 flags |= O_APPEND; 1006 } 1007 if (has(mode, WriteMode::EXECUTABLE)) { 1008 acl = 0777; 1009 } 1010 if (has(mode, WriteMode::PRIVATE)) { 1011 acl &= 0700; 1012 } 1013 1014 auto filename = path.toString(); 1015 1016 int newFd; 1017 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) { 1018 case ENOENT: 1019 if (has(mode, WriteMode::CREATE)) { 1020 // Either: 1021 // - The file is a broken symlink. 1022 // - A parent directory didn't exist. 1023 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && 1024 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | 1025 WriteMode::CREATE_PARENT, true)) { 1026 // Retry, but make sure we don't try to create the parent again. 1027 return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append); 1028 } 1029 1030 // Check for broken link. 1031 if (!has(mode, WriteMode::MODIFY) && 1032 faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { 1033 // Yep. We treat this as already-exists, which means in CREATE-only mode this is a 1034 // simple failure. 1035 return nullptr; 1036 } 1037 1038 KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; } 1039 } else { 1040 // MODIFY-only mode. ENOENT = doesn't exist = return null. 1041 return nullptr; 1042 } 1043 case ENOTDIR: 1044 if (!has(mode, WriteMode::CREATE)) { 1045 // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null. 1046 return nullptr; 1047 } 1048 goto failed; 1049 case EEXIST: 1050 if (!has(mode, WriteMode::MODIFY)) { 1051 // CREATE-only mode. EEXIST = already exists = return null. 1052 return nullptr; 1053 } 1054 goto failed; 1055 default: 1056 failed: 1057 KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; } 1058 } 1059 1060 kj::AutoCloseFd result(newFd); 1061 #ifndef O_CLOEXEC 1062 setCloexec(result); 1063 #endif 1064 1065 return kj::mv(result); 1066 } 1067 1068 bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode, 1069 int* errorReason = nullptr) const { 1070 if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { 1071 // Always clobber. Try it. 1072 KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) { 1073 case EISDIR: 1074 case ENOTDIR: 1075 case ENOTEMPTY: 1076 case EEXIST: 1077 // Failed because target exists and due to the various weird quirks of rename(), it 1078 // can't remove it for us. On Linux we can try an exchange instead. On others we have 1079 // to move the target out of the way. 1080 break; 1081 default: 1082 if (errorReason == nullptr) { 1083 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; } 1084 } else { 1085 *errorReason = error; 1086 return false; 1087 } 1088 } else { 1089 return true; 1090 } 1091 } 1092 1093 #if __linux__ && defined(RENAME_EXCHANGE) 1094 // Try to use Linux's renameat2() to atomically check preconditions and apply. 1095 1096 if (has(mode, WriteMode::MODIFY)) { 1097 // Use an exchange to implement modification. 1098 // 1099 // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY 1100 // in which we determined above that there's a node of a different type blocking the 1101 // exchange. 1102 1103 KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, 1104 fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) { 1105 case ENOSYS: // Syscall not supported by kernel. 1106 case EINVAL: // Maybe we screwed up, or maybe the syscall is not supported by the 1107 // filesystem. Unfortunately, there's no way to tell, so assume the latter. 1108 // ZFS in particular apparently produces EINVAL. 1109 break; // fall back to traditional means 1110 case ENOENT: 1111 // Presumably because the target path doesn't exist. 1112 if (has(mode, WriteMode::CREATE)) { 1113 KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but " 1114 "renameat2(fromPath, toPath, EXCAHNGE) said it doest; concurrent modification?", 1115 fromPath, toPath) { return false; } 1116 } else { 1117 // Assume target doesn't exist. 1118 return false; 1119 } 1120 default: 1121 if (errorReason == nullptr) { 1122 KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) { 1123 return false; 1124 } 1125 } else { 1126 *errorReason = error; 1127 return false; 1128 } 1129 } else { 1130 // Successful swap! Delete swapped-out content. 1131 rmrf(fromDirFd, fromPath); 1132 return true; 1133 } 1134 } else if (has(mode, WriteMode::CREATE)) { 1135 KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, 1136 fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) { 1137 case ENOSYS: // Syscall not supported by kernel. 1138 case EINVAL: // Maybe we screwed up, or maybe the syscall is not supported by the 1139 // filesystem. Unfortunately, there's no way to tell, so assume the latter. 1140 // ZFS in particular apparently produces EINVAL. 1141 break; // fall back to traditional means 1142 case EEXIST: 1143 return false; 1144 default: 1145 if (errorReason == nullptr) { 1146 KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) { 1147 return false; 1148 } 1149 } else { 1150 *errorReason = error; 1151 return false; 1152 } 1153 } else { 1154 return true; 1155 } 1156 } 1157 #endif 1158 1159 // We're unable to do what we wanted atomically. :( 1160 1161 if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { 1162 // We failed to atomically delete the target previously. So now we need to do two calls in 1163 // rapid succession to move the old file away then move the new one into place. 1164 1165 // Find out what kind of file exists at the target path. 1166 struct stat stats; 1167 KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; } 1168 1169 // Create a temporary location to move the existing object to. Note that rename() allows a 1170 // non-directory to replace a non-directory, and allows a directory to replace an empty 1171 // directory. So we have to create the right type. 1172 Path toPathParsed = Path::parse(toPath); 1173 String away; 1174 KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE, 1175 [&](StringPtr candidatePath) { 1176 if (S_ISDIR(stats.st_mode)) { 1177 return mkdirat(fd, candidatePath.cStr(), 0700); 1178 } else { 1179 #if __APPLE__ || __FreeBSD__ 1180 // - No mknodat() on OSX, gotta open() a file, ugh. 1181 // - On a modern FreeBSD, mknodat() is reserved strictly for device nodes, 1182 // you cannot create a regular file using it (EINVAL). 1183 int newFd = openat(fd, candidatePath.cStr(), 1184 O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700); 1185 if (newFd >= 0) close(newFd); 1186 return newFd; 1187 #else 1188 return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t()); 1189 #endif 1190 } 1191 })) { 1192 away = kj::mv(*awayPath); 1193 } else { 1194 // Already threw. 1195 return false; 1196 } 1197 1198 // OK, now move the target object to replace the thing we just created. 1199 KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) { 1200 // Something went wrong. Remove the thing we just created. 1201 unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0); 1202 return false; 1203 } 1204 1205 // Now move the source object to the target location. 1206 KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) { 1207 default: 1208 // Try to put things back where they were. If this fails, though, then we have little 1209 // choice but to leave things broken. 1210 KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) { 1211 default: break; 1212 } 1213 1214 if (errorReason == nullptr) { 1215 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { 1216 return false; 1217 } 1218 } else { 1219 *errorReason = error; 1220 return false; 1221 } 1222 } 1223 1224 // OK, success. Delete the old content. 1225 rmrf(fd, away); 1226 return true; 1227 } else { 1228 // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the 1229 // corresponding precondition (must-not-exist or must-exist, respectively) is held. 1230 if (has(mode, WriteMode::CREATE)) { 1231 struct stat stats; 1232 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { 1233 case ENOENT: 1234 case ENOTDIR: 1235 break; // doesn't exist; continue 1236 default: 1237 KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } 1238 } else { 1239 return false; // already exists; fail 1240 } 1241 } else if (has(mode, WriteMode::MODIFY)) { 1242 struct stat stats; 1243 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { 1244 case ENOENT: 1245 case ENOTDIR: 1246 return false; // doesn't exist; fail 1247 default: 1248 KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } 1249 } else { 1250 // already exists; continue 1251 } 1252 } else { 1253 // Neither CREATE nor MODIFY. 1254 return false; 1255 } 1256 1257 // Start over in create-and-modify mode. 1258 return tryCommitReplacement(toPath, fromDirFd, fromPath, 1259 WriteMode::CREATE | WriteMode::MODIFY, 1260 errorReason); 1261 } 1262 } 1263 1264 template <typename T> 1265 class ReplacerImpl final: public Directory::Replacer<T> { 1266 public: 1267 ReplacerImpl(Own<const T>&& object, const DiskHandle& handle, 1268 String&& tempPath, String&& path, WriteMode mode) 1269 : Directory::Replacer<T>(mode), 1270 object(kj::mv(object)), handle(handle), 1271 tempPath(kj::mv(tempPath)), path(kj::mv(path)) {} 1272 1273 ~ReplacerImpl() noexcept(false) { 1274 if (!committed) { 1275 rmrf(handle.fd, tempPath); 1276 } 1277 } 1278 1279 const T& get() override { 1280 return *object; 1281 } 1282 1283 bool tryCommit() override { 1284 KJ_ASSERT(!committed, "already committed") { return false; } 1285 return committed = handle.tryCommitReplacement(path, handle.fd, tempPath, 1286 Directory::Replacer<T>::mode); 1287 } 1288 1289 private: 1290 Own<const T> object; 1291 const DiskHandle& handle; 1292 String tempPath; 1293 String path; 1294 bool committed = false; // true if *successfully* committed (in which case tempPath is gone) 1295 }; 1296 1297 template <typename T> 1298 class BrokenReplacer final: public Directory::Replacer<T> { 1299 // For recovery path when exceptions are disabled. 1300 1301 public: 1302 BrokenReplacer(Own<const T> inner) 1303 : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY), 1304 inner(kj::mv(inner)) {} 1305 1306 const T& get() override { return *inner; } 1307 bool tryCommit() override { return false; } 1308 1309 private: 1310 Own<const T> inner; 1311 }; 1312 1313 Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const { 1314 return tryOpenFileInternal(path, mode, false).map(newDiskFile); 1315 } 1316 1317 Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const { 1318 mode_t acl = 0666; 1319 if (has(mode, WriteMode::EXECUTABLE)) { 1320 acl = 0777; 1321 } 1322 if (has(mode, WriteMode::PRIVATE)) { 1323 acl &= 0700; 1324 } 1325 1326 int newFd_; 1327 KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, 1328 [&](StringPtr candidatePath) { 1329 return newFd_ = openat(fd, candidatePath.cStr(), 1330 O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl); 1331 })) { 1332 AutoCloseFd newFd(newFd_); 1333 #ifndef O_CLOEXEC 1334 setCloexec(newFd); 1335 #endif 1336 return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp), 1337 path.toString(), mode); 1338 } else { 1339 // threw, but exceptions are disabled 1340 return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock())); 1341 } 1342 } 1343 1344 Own<const File> createTemporary() const { 1345 int newFd_; 1346 1347 #if __linux__ && defined(O_TMPFILE) 1348 // Use syscall() to work around glibc bug with O_TMPFILE: 1349 // https://sourceware.org/bugzilla/show_bug.cgi?id=17523 1350 KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall( 1351 SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) { 1352 case EOPNOTSUPP: 1353 case EINVAL: 1354 case EISDIR: 1355 // Maybe not supported by this kernel / filesystem. Fall back to below. 1356 break; 1357 default: 1358 KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; } 1359 break; 1360 } else { 1361 AutoCloseFd newFd(newFd_); 1362 #ifndef O_CLOEXEC 1363 setCloexec(newFd); 1364 #endif 1365 return newDiskFile(kj::mv(newFd)); 1366 } 1367 #endif 1368 1369 KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE, 1370 [&](StringPtr path) { 1371 return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600); 1372 })) { 1373 AutoCloseFd newFd(newFd_); 1374 #ifndef O_CLOEXEC 1375 setCloexec(newFd); 1376 #endif 1377 auto result = newDiskFile(kj::mv(newFd)); 1378 KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; } 1379 return kj::mv(result); 1380 } else { 1381 // threw, but exceptions are disabled 1382 return newInMemoryFile(nullClock()); 1383 } 1384 } 1385 1386 Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const { 1387 return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile); 1388 } 1389 1390 Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const { 1391 // Must create before open. 1392 if (has(mode, WriteMode::CREATE)) { 1393 if (!tryMkdir(path, mode, false)) return nullptr; 1394 } 1395 1396 return tryOpenSubdirInternal(path).map(newDiskDirectory); 1397 } 1398 1399 Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const { 1400 mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; 1401 1402 KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, 1403 [&](StringPtr candidatePath) { 1404 return mkdirat(fd, candidatePath.cStr(), acl); 1405 })) { 1406 int subdirFd_; 1407 KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat( 1408 fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { 1409 default: 1410 KJ_FAIL_SYSCALL("open(just-created-temporary)", error); 1411 return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); 1412 } 1413 1414 AutoCloseFd subdirFd(subdirFd_); 1415 #ifndef O_CLOEXEC 1416 setCloexec(subdirFd); 1417 #endif 1418 return heap<ReplacerImpl<Directory>>( 1419 newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode); 1420 } else { 1421 // threw, but exceptions are disabled 1422 return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); 1423 } 1424 } 1425 1426 bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const { 1427 return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) { 1428 return symlinkat(content.cStr(), fd, candidatePath.cStr()); 1429 }); 1430 } 1431 1432 bool tryTransfer(PathPtr toPath, WriteMode toMode, 1433 const Directory& fromDirectory, PathPtr fromPath, 1434 TransferMode mode, const Directory& self) const { 1435 KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; } 1436 1437 if (mode == TransferMode::LINK) { 1438 KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { 1439 // Other is a disk directory, so we can hopefully do an efficient move/link. 1440 return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) { 1441 return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0); 1442 }); 1443 }; 1444 } else if (mode == TransferMode::MOVE) { 1445 KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { 1446 KJ_ASSERT(mode == TransferMode::MOVE); 1447 1448 int error = 0; 1449 if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode, 1450 &error)) { 1451 return true; 1452 } else switch (error) { 1453 case 0: 1454 // Plain old WriteMode precondition failure. 1455 return false; 1456 case EXDEV: 1457 // Can't move between devices. Fall back to default implementation, which does 1458 // copy/delete. 1459 break; 1460 case ENOENT: 1461 // Either the destination directory doesn't exist or the source path doesn't exist. 1462 // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating 1463 // the parent directory. Otherwise, we don't actually need to distinguish between 1464 // these two errors; just return false. 1465 if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) && 1466 toPath.size() > 0 && tryMkdir(toPath.parent(), 1467 WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { 1468 // Retry, but make sure we don't try to create the parent again. 1469 return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT, 1470 fromDirectory, fromPath, mode, self); 1471 } 1472 return false; 1473 default: 1474 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { 1475 return false; 1476 } 1477 } 1478 } 1479 } 1480 1481 // OK, we can't do anything efficient using the OS. Fall back to default implementation. 1482 return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode); 1483 } 1484 1485 bool tryRemove(PathPtr path) const { 1486 return rmrf(fd, path.toString()); 1487 } 1488 1489 protected: 1490 AutoCloseFd fd; 1491 }; 1492 1493 #define FSNODE_METHODS(classname) \ 1494 Maybe<int> getFd() const override { return DiskHandle::getFd(); } \ 1495 \ 1496 Own<const FsNode> cloneFsNode() const override { \ 1497 return heap<classname>(DiskHandle::clone()); \ 1498 } \ 1499 \ 1500 Metadata stat() const override { return DiskHandle::stat(); } \ 1501 void sync() const override { DiskHandle::sync(); } \ 1502 void datasync() const override { DiskHandle::datasync(); } 1503 1504 class DiskReadableFile final: public ReadableFile, public DiskHandle { 1505 public: 1506 DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} 1507 1508 FSNODE_METHODS(DiskReadableFile); 1509 1510 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { 1511 return DiskHandle::read(offset, buffer); 1512 } 1513 Array<const byte> mmap(uint64_t offset, uint64_t size) const override { 1514 return DiskHandle::mmap(offset, size); 1515 } 1516 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { 1517 return DiskHandle::mmapPrivate(offset, size); 1518 } 1519 }; 1520 1521 class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream { 1522 public: 1523 DiskAppendableFile(AutoCloseFd&& fd) 1524 : DiskHandle(kj::mv(fd)), 1525 FdOutputStream(DiskHandle::fd.get()) {} 1526 1527 FSNODE_METHODS(DiskAppendableFile); 1528 1529 void write(const void* buffer, size_t size) override { 1530 FdOutputStream::write(buffer, size); 1531 } 1532 void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override { 1533 FdOutputStream::write(pieces); 1534 } 1535 }; 1536 1537 class DiskFile final: public File, public DiskHandle { 1538 public: 1539 DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} 1540 1541 FSNODE_METHODS(DiskFile); 1542 1543 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { 1544 return DiskHandle::read(offset, buffer); 1545 } 1546 Array<const byte> mmap(uint64_t offset, uint64_t size) const override { 1547 return DiskHandle::mmap(offset, size); 1548 } 1549 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { 1550 return DiskHandle::mmapPrivate(offset, size); 1551 } 1552 1553 void write(uint64_t offset, ArrayPtr<const byte> data) const override { 1554 DiskHandle::write(offset, data); 1555 } 1556 void zero(uint64_t offset, uint64_t size) const override { 1557 DiskHandle::zero(offset, size); 1558 } 1559 void truncate(uint64_t size) const override { 1560 DiskHandle::truncate(size); 1561 } 1562 Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override { 1563 return DiskHandle::mmapWritable(offset, size); 1564 } 1565 size_t copy(uint64_t offset, const ReadableFile& from, 1566 uint64_t fromOffset, uint64_t size) const override { 1567 KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) { 1568 return *result; 1569 } else { 1570 return File::copy(offset, from, fromOffset, size); 1571 } 1572 } 1573 }; 1574 1575 class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle { 1576 public: 1577 DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} 1578 1579 FSNODE_METHODS(DiskReadableDirectory); 1580 1581 Array<String> listNames() const override { return DiskHandle::listNames(); } 1582 Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } 1583 bool exists(PathPtr path) const override { return DiskHandle::exists(path); } 1584 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { 1585 return DiskHandle::tryLstat(path); 1586 } 1587 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { 1588 return DiskHandle::tryOpenFile(path); 1589 } 1590 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { 1591 return DiskHandle::tryOpenSubdir(path); 1592 } 1593 Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } 1594 }; 1595 1596 class DiskDirectory final: public Directory, public DiskHandle { 1597 public: 1598 DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} 1599 1600 FSNODE_METHODS(DiskDirectory); 1601 1602 Array<String> listNames() const override { return DiskHandle::listNames(); } 1603 Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } 1604 bool exists(PathPtr path) const override { return DiskHandle::exists(path); } 1605 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { 1606 return DiskHandle::tryLstat(path); 1607 } 1608 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { 1609 return DiskHandle::tryOpenFile(path); 1610 } 1611 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { 1612 return DiskHandle::tryOpenSubdir(path); 1613 } 1614 Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } 1615 1616 Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override { 1617 return DiskHandle::tryOpenFile(path, mode); 1618 } 1619 Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override { 1620 return DiskHandle::replaceFile(path, mode); 1621 } 1622 Own<const File> createTemporary() const override { 1623 return DiskHandle::createTemporary(); 1624 } 1625 Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override { 1626 return DiskHandle::tryAppendFile(path, mode); 1627 } 1628 Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override { 1629 return DiskHandle::tryOpenSubdir(path, mode); 1630 } 1631 Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override { 1632 return DiskHandle::replaceSubdir(path, mode); 1633 } 1634 bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override { 1635 return DiskHandle::trySymlink(linkpath, content, mode); 1636 } 1637 bool tryTransfer(PathPtr toPath, WriteMode toMode, 1638 const Directory& fromDirectory, PathPtr fromPath, 1639 TransferMode mode) const override { 1640 return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this); 1641 } 1642 // tryTransferTo() not implemented because we have nothing special we can do. 1643 bool tryRemove(PathPtr path) const override { 1644 return DiskHandle::tryRemove(path); 1645 } 1646 }; 1647 1648 class DiskFilesystem final: public Filesystem { 1649 public: 1650 DiskFilesystem() 1651 : root(openDir("/")), 1652 current(openDir(".")), 1653 currentPath(computeCurrentPath()) {} 1654 1655 const Directory& getRoot() const override { 1656 return root; 1657 } 1658 1659 const Directory& getCurrent() const override { 1660 return current; 1661 } 1662 1663 PathPtr getCurrentPath() const override { 1664 return currentPath; 1665 } 1666 1667 private: 1668 DiskDirectory root; 1669 DiskDirectory current; 1670 Path currentPath; 1671 1672 static AutoCloseFd openDir(const char* dir) { 1673 int newFd; 1674 KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)); 1675 AutoCloseFd result(newFd); 1676 #ifndef O_CLOEXEC 1677 setCloexec(result); 1678 #endif 1679 return result; 1680 } 1681 1682 static Path computeCurrentPath() { 1683 // If env var PWD is set and points to the current directory, use it. This captures the current 1684 // path according to the user's shell, which may differ from the kernel's idea in the presence 1685 // of symlinks. 1686 const char* pwd = getenv("PWD"); 1687 if (pwd != nullptr) { 1688 Path result = nullptr; 1689 struct stat pwdStat, dotStat; 1690 KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() { 1691 KJ_ASSERT(pwd[0] == '/') { return; } 1692 result = Path::parse(pwd + 1); 1693 KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; } 1694 KJ_SYSCALL(lstat(".", &dotStat)) { return; } 1695 })) { 1696 // failed, give up on PWD 1697 KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e); 1698 } else { 1699 if (pwdStat.st_ino == dotStat.st_ino && 1700 pwdStat.st_dev == dotStat.st_dev) { 1701 return kj::mv(result); 1702 } else { 1703 KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd); 1704 } 1705 } 1706 } 1707 1708 size_t size = 256; 1709 retry: 1710 KJ_STACK_ARRAY(char, buf, size, 256, 4096); 1711 if (getcwd(buf.begin(), size) == nullptr) { 1712 int error = errno; 1713 if (error == ENAMETOOLONG) { 1714 size *= 2; 1715 goto retry; 1716 } else { 1717 KJ_FAIL_SYSCALL("getcwd()", error); 1718 } 1719 } 1720 1721 StringPtr path = buf.begin(); 1722 1723 // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir 1724 // of the root directory, which is possible via chroot() or mount namespaces. 1725 KJ_ASSERT(!path.startsWith("(unreachable)"), 1726 "working directory is not reachable from root", path); 1727 KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path); 1728 1729 return Path::parse(path.slice(1)); 1730 } 1731 }; 1732 1733 } // namespace 1734 1735 Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) { 1736 return heap<DiskReadableFile>(kj::mv(fd)); 1737 } 1738 Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) { 1739 return heap<DiskAppendableFile>(kj::mv(fd)); 1740 } 1741 Own<File> newDiskFile(kj::AutoCloseFd fd) { 1742 return heap<DiskFile>(kj::mv(fd)); 1743 } 1744 Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) { 1745 return heap<DiskReadableDirectory>(kj::mv(fd)); 1746 } 1747 Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) { 1748 return heap<DiskDirectory>(kj::mv(fd)); 1749 } 1750 1751 Own<Filesystem> newDiskFilesystem() { 1752 return heap<DiskFilesystem>(); 1753 } 1754 1755 } // namespace kj 1756 1757 #endif // !_WIN32