capnproto

FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
git clone https://git.neptards.moe/neptards/capnproto.git
Log | Files | Refs | README | LICENSE

filesystem-disk-unix.c++ (60907B)


      1 // Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors
      2 // Licensed under the MIT License:
      3 //
      4 // Permission is hereby granted, free of charge, to any person obtaining a copy
      5 // of this software and associated documentation files (the "Software"), to deal
      6 // in the Software without restriction, including without limitation the rights
      7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      8 // copies of the Software, and to permit persons to whom the Software is
      9 // furnished to do so, subject to the following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included in
     12 // all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     20 // THE SOFTWARE.
     21 
     22 #if !_WIN32
     23 
     24 #ifndef _GNU_SOURCE
     25 #define _GNU_SOURCE
     26 #endif
     27 
     28 #include "filesystem.h"
     29 #include "debug.h"
     30 #include <sys/types.h>
     31 #include <sys/stat.h>
     32 #include <sys/ioctl.h>
     33 #include <fcntl.h>
     34 #include <unistd.h>
     35 #include <stdio.h>
     36 #include <sys/mman.h>
     37 #include <errno.h>
     38 #include <dirent.h>
     39 #include <stdlib.h>
     40 #include "vector.h"
     41 #include "miniposix.h"
     42 #include <algorithm>
     43 
     44 #if __linux__
     45 #include <syscall.h>
     46 #include <linux/fs.h>
     47 #include <sys/sendfile.h>
     48 #endif
     49 
     50 namespace kj {
     51 namespace {
     52 
#define HIDDEN_PREFIX ".kj-tmp."
// Prefix for temp files which should be hidden when listing a directory.
//
// If you change this, make sure to update the unit test.

// MAYBE_O_CLOEXEC expands to O_CLOEXEC where the platform defines it and to 0 otherwise, so it
// can be OR'ed into open flags unconditionally. Where it is 0, callers in this file follow up
// with setCloexec() under `#ifndef O_CLOEXEC`.
#ifdef O_CLOEXEC
#define MAYBE_O_CLOEXEC O_CLOEXEC
#else
#define MAYBE_O_CLOEXEC 0
#endif

// MAYBE_O_DIRECTORY: same idea as MAYBE_O_CLOEXEC, for O_DIRECTORY.
#ifdef O_DIRECTORY
#define MAYBE_O_DIRECTORY O_DIRECTORY
#else
#define MAYBE_O_DIRECTORY 0
#endif

#if __APPLE__
// Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.)
// Undefining it makes the `#ifdef SEEK_HOLE` code in this file take its "assume no holes" path.
#undef SEEK_HOLE
#endif

#if __BIONIC__
// Bionic has no DTTOIF() macro. The d_type-based code paths in this file are guarded by
// `#ifdef DT_UNKNOWN`, so undefining DT_UNKNOWN disables them.
#undef DT_UNKNOWN
#endif
     79 
     80 static void setCloexec(int fd) KJ_UNUSED;
     81 static void setCloexec(int fd) {
     82   // Set the O_CLOEXEC flag on the given fd.
     83   //
     84   // We try to avoid the need to call this by taking advantage of syscall flags that set it
     85   // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls.
     86 
     87 #ifdef FIOCLEX
     88   // Yay, we can set the flag in one call.
     89   KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) {
     90     case EINVAL:
     91     case EOPNOTSUPP:
     92       break;
     93     default:
     94       KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; }
     95       break;
     96   } else {
     97     // success
     98     return;
     99   }
    100 #endif
    101 
    102   // Sadness, we must resort to read/modify/write.
    103   //
    104   // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could
    105   // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway
    106   // most platforms support FIOCLEX.)
    107   int flags;
    108   KJ_SYSCALL(flags = fcntl(fd, F_GETFD));
    109   if (!(flags & FD_CLOEXEC)) {
    110     KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
    111   }
    112 }
    113 
    114 static Date toKjDate(struct timespec tv) {
    115   return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH;
    116 }
    117 
    118 static FsNode::Type modeToType(mode_t mode) {
    119   switch (mode & S_IFMT) {
    120     case S_IFREG : return FsNode::Type::FILE;
    121     case S_IFDIR : return FsNode::Type::DIRECTORY;
    122     case S_IFLNK : return FsNode::Type::SYMLINK;
    123     case S_IFBLK : return FsNode::Type::BLOCK_DEVICE;
    124     case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE;
    125     case S_IFIFO : return FsNode::Type::NAMED_PIPE;
    126     case S_IFSOCK: return FsNode::Type::SOCKET;
    127     default: return FsNode::Type::OTHER;
    128   }
    129 }
    130 
    131 static FsNode::Metadata statToMetadata(struct stat& stats) {
    132   // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits
    133   // and XOR.
    134   uint64_t d = stats.st_dev;
    135   uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino;
    136 
    137   return FsNode::Metadata {
    138     modeToType(stats.st_mode),
    139     implicitCast<uint64_t>(stats.st_size),
    140     implicitCast<uint64_t>(stats.st_blocks * 512u),
    141 #if __APPLE__
    142     toKjDate(stats.st_mtimespec),
    143 #else
    144     toKjDate(stats.st_mtim),
    145 #endif
    146     implicitCast<uint>(stats.st_nlink),
    147     hash
    148   };
    149 }
    150 
    151 static bool rmrf(int fd, StringPtr path);
    152 
    153 static void rmrfChildrenAndClose(int fd) {
    154   // Assumes fd is seeked to beginning.
    155 
    156   DIR* dir = fdopendir(fd);
    157   if (dir == nullptr) {
    158     close(fd);
    159     KJ_FAIL_SYSCALL("fdopendir", errno);
    160   };
    161   KJ_DEFER(closedir(dir));
    162 
    163   for (;;) {
    164     errno = 0;
    165     struct dirent* entry = readdir(dir);
    166     if (entry == nullptr) {
    167       int error = errno;
    168       if (error == 0) {
    169         break;
    170       } else {
    171         KJ_FAIL_SYSCALL("readdir", error);
    172       }
    173     }
    174 
    175     if (entry->d_name[0] == '.' &&
    176         (entry->d_name[1] == '\0' ||
    177          (entry->d_name[1] == '.' &&
    178           entry->d_name[2] == '\0'))) {
    179       // ignore . and ..
    180     } else {
    181 #ifdef DT_UNKNOWN    // d_type is not available on all platforms.
    182       if (entry->d_type == DT_DIR) {
    183         int subdirFd;
    184         KJ_SYSCALL(subdirFd = openat(
    185             fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC));
    186         rmrfChildrenAndClose(subdirFd);
    187         KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR));
    188       } else if (entry->d_type != DT_UNKNOWN) {
    189         KJ_SYSCALL(unlinkat(fd, entry->d_name, 0));
    190       } else {
    191 #endif
    192         KJ_ASSERT(rmrf(fd, entry->d_name));
    193 #ifdef DT_UNKNOWN
    194       }
    195 #endif
    196     }
    197   }
    198 }
    199 
static bool rmrf(int fd, StringPtr path) {
  // Recursively deletes `path`, resolved relative to the directory descriptor `fd`.
  //
  // Returns false if the path does not exist (ENOENT / ENOTDIR from fstatat()), or if a syscall
  // fails and the failure is recovered from via the KJ recovery blocks. Returns true once the
  // node has been removed.

  // AT_SYMLINK_NOFOLLOW: inspect the node itself rather than whatever a symlink points at.
  struct stat stats;
  KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
    case ENOENT:
    case ENOTDIR:
      // Doesn't exist.
      return false;
    default:
      KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; }
  }

  if (S_ISDIR(stats.st_mode)) {
    // Directory: empty it first (rmrfChildrenAndClose() takes ownership of subdirFd), then
    // remove the now-empty directory.
    int subdirFd;
    KJ_SYSCALL(subdirFd = openat(
        fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; }
    rmrfChildrenAndClose(subdirFd);
    KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; }
  } else {
    // Anything else is removed with a plain unlink.
    KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; }
  }

  return true;
}
    223 
    224 struct MmapRange {
    225   uint64_t offset;
    226   uint64_t size;
    227 };
    228 
    229 static MmapRange getMmapRange(uint64_t offset, uint64_t size) {
    230   // Comes up with an offset and size to pass to mmap(), given an offset and size requested by
    231   // the caller, and considering the fact that mappings must start at a page boundary.
    232   //
    233   // The offset is rounded down to the nearest page boundary, and the size is increased to
    234   // compensate. Note that the endpoint of the mapping is *not* rounded up to a page boundary, as
    235   // mmap() does not actually require this, and it causes trouble on some systems (notably Cygwin).
    236 
    237 #ifndef _SC_PAGESIZE
    238 #define _SC_PAGESIZE _SC_PAGE_SIZE
    239 #endif
    240   static const uint64_t pageSize = sysconf(_SC_PAGESIZE);
    241   uint64_t pageMask = pageSize - 1;
    242 
    243   uint64_t realOffset = offset & ~pageMask;
    244 
    245   return { realOffset, offset + size - realOffset };
    246 }
    247 
    248 class MmapDisposer: public ArrayDisposer {
    249 protected:
    250   void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount,
    251                    size_t capacity, void (*destroyElement)(void*)) const {
    252     auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement),
    253                               elementSize * elementCount);
    254     KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; }
    255   }
    256 };
    257 
    258 constexpr MmapDisposer mmapDisposer = MmapDisposer();
    259 
    260 class DiskHandle {
    261   // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and
    262   // Directory for disk handles. There is a lot of implementation overlap between these, especially
    263   // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements
    264   // these because then we get diamond inheritance which means we need to make all our inheritance
    265   // virtual which means downcasting requires RTTI which violates our goal of supporting compiling
    266   // with no RTTI. So instead we have the DiskHandle class which implements all the methods without
    267   // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to
    268   // it. Ugly, but works.
    269 
    270 public:
  DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {}
  // Takes ownership of `fd` (it is moved into the handle); the methods below operate on it.
    272 
    273   // OsHandle ------------------------------------------------------------------
    274 
    275   AutoCloseFd clone() const {
    276     int fd2;
    277 #ifdef F_DUPFD_CLOEXEC
    278     KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) {
    279       case EINVAL:
    280       case EOPNOTSUPP:
    281         // fall back
    282         break;
    283       default:
    284         KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; }
    285         break;
    286     } else {
    287       return AutoCloseFd(fd2);
    288     }
    289 #endif
    290 
    291     KJ_SYSCALL(fd2 = ::dup(fd));
    292     AutoCloseFd result(fd2);
    293     setCloexec(result);
    294     return result;
    295   }
    296 
  int getFd() const {
    // Returns the raw file descriptor. Ownership stays with this handle.
    return fd.get();
  }
    300 
    301   // FsNode --------------------------------------------------------------------
    302 
  FsNode::Metadata stat() const {
    // fstat()s the descriptor and converts the result with statToMetadata().
    struct stat stats;
    KJ_SYSCALL(::fstat(fd, &stats));
    return statToMetadata(stats);
  }
    308 
  void sync() const {
    // Flushes the file to stable storage: fcntl(F_FULLFSYNC) on Apple platforms, fsync()
    // everywhere else.
#if __APPLE__
    // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware
    // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does
    // the right thing. Why they don't just make fsync() do the right thing, I do not know.
    KJ_SYSCALL(fcntl(fd, F_FULLFSYNC));
#else
    KJ_SYSCALL(fsync(fd));
#endif
  }
    319 
  void datasync() const {
    // Calls fdatasync() where it is available; otherwise falls back to the full sync().
    //
    // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync()
    // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple.
#if _POSIX_SYNCHRONIZED_IO && !__APPLE__
    KJ_SYSCALL(fdatasync(fd));
#else
    this->sync();
#endif
  }
    329 
    330   // ReadableFile --------------------------------------------------------------
    331 
    332   size_t read(uint64_t offset, ArrayPtr<byte> buffer) const {
    333     // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per
    334     // spec we are not allowed to assume this.
    335 
    336     size_t total = 0;
    337     while (buffer.size() > 0) {
    338       ssize_t n;
    339       KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset));
    340       if (n == 0) break;
    341       total += n;
    342       offset += n;
    343       buffer = buffer.slice(n, buffer.size());
    344     }
    345     return total;
    346   }
    347 
    348   Array<const byte> mmap(uint64_t offset, uint64_t size) const {
    349     if (size == 0) return nullptr;  // zero-length mmap() returns EINVAL, so avoid it
    350     auto range = getMmapRange(offset, size);
    351     const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset);
    352     if (mapping == MAP_FAILED) {
    353       KJ_FAIL_SYSCALL("mmap", errno);
    354     }
    355     return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset),
    356                              size, mmapDisposer);
    357   }
    358 
    359   Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const {
    360     if (size == 0) return nullptr;  // zero-length mmap() returns EINVAL, so avoid it
    361     auto range = getMmapRange(offset, size);
    362     void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset);
    363     if (mapping == MAP_FAILED) {
    364       KJ_FAIL_SYSCALL("mmap", errno);
    365     }
    366     return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
    367                        size, mmapDisposer);
    368   }
    369 
    370   // File ----------------------------------------------------------------------
    371 
    372   void write(uint64_t offset, ArrayPtr<const byte> data) const {
    373     // pwrite() probably never returns short writes unless there's no space left on disk.
    374     // Unfortunately, though, per spec we are not allowed to assume this.
    375 
    376     while (data.size() > 0) {
    377       ssize_t n;
    378       KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset));
    379       KJ_ASSERT(n > 0, "pwrite() returned zero?");
    380       offset += n;
    381       data = data.slice(n, data.size());
    382     }
    383   }
    384 
    385   void zero(uint64_t offset, uint64_t size) const {
    386     // If FALLOC_FL_PUNCH_HOLE is defined, use it to efficiently zero the area.
    387     //
    388     // A fallocate() wrapper was only added to Android's Bionic C library as of API level 21,
    389     // but FALLOC_FL_PUNCH_HOLE is apparently defined in the headers before that, so we'll
    390     // have to explicitly test for that case.
    391 #if defined(FALLOC_FL_PUNCH_HOLE) && !(__ANDROID__ && __BIONIC__ && __ANDROID_API__ < 21)
    392     KJ_SYSCALL_HANDLE_ERRORS(
    393         fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) {
    394       case EOPNOTSUPP:
    395         // fall back to below
    396         break;
    397       default:
    398         KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; }
    399     } else {
    400       return;
    401     }
    402 #endif
    403 
    404     static const byte ZEROS[4096] = { 0 };
    405 
    406 #if __APPLE__ || __CYGWIN__ || (defined(__ANDROID__) && __ANDROID_API__ < 24)
    407     // Mac & Cygwin & Android API levels 23 and lower doesn't have pwritev().
    408     while (size > sizeof(ZEROS)) {
    409       write(offset, ZEROS);
    410       size -= sizeof(ZEROS);
    411       offset += sizeof(ZEROS);
    412     }
    413     write(offset, kj::arrayPtr(ZEROS, size));
    414 #else
    415     // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible.
    416     size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS);
    417     const size_t iovmax = miniposix::iovMax();
    418     KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256);
    419 
    420     for (auto& item: iov) {
    421       item.iov_base = const_cast<byte*>(ZEROS);
    422       item.iov_len = sizeof(ZEROS);
    423     }
    424 
    425     while (size > 0) {
    426       size_t iovCount;
    427       if (size >= iov.size() * sizeof(ZEROS)) {
    428         iovCount = iov.size();
    429       } else {
    430         iovCount = size / sizeof(ZEROS);
    431         size_t rem = size % sizeof(ZEROS);
    432         if (rem > 0) {
    433           iov[iovCount++].iov_len = rem;
    434         }
    435       }
    436 
    437       ssize_t n;
    438       KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset));
    439       KJ_ASSERT(n > 0, "pwrite() returned zero?");
    440 
    441       offset += n;
    442       size -= n;
    443     }
    444 #endif
    445   }
    446 
  void truncate(uint64_t size) const {
    // Sets the file's length to `size` bytes via ftruncate().
    KJ_SYSCALL(ftruncate(fd, size));
  }
    450 
    451   class WritableFileMappingImpl final: public WritableFileMapping {
    452   public:
    453     WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {}
    454 
    455     ArrayPtr<byte> get() const override {
    456       // const_cast OK because WritableFileMapping does indeed provide a writable view despite
    457       // being const itself.
    458       return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size());
    459     }
    460 
    461     void changed(ArrayPtr<byte> slice) const override {
    462       KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
    463                  "byte range is not part of this mapping");
    464       if (slice.size() == 0) return;
    465 
    466       // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
    467       auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
    468       KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC));
    469     }
    470 
    471     void sync(ArrayPtr<byte> slice) const override {
    472       KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
    473                  "byte range is not part of this mapping");
    474       if (slice.size() == 0) return;
    475 
    476       // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
    477       auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
    478       KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC));
    479     }
    480 
    481   private:
    482     Array<byte> bytes;
    483   };
    484 
    485   Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const {
    486     if (size == 0) {
    487       // zero-length mmap() returns EINVAL, so avoid it
    488       return heap<WritableFileMappingImpl>(nullptr);
    489     }
    490     auto range = getMmapRange(offset, size);
    491     void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset);
    492     if (mapping == MAP_FAILED) {
    493       KJ_FAIL_SYSCALL("mmap", errno);
    494     }
    495     auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
    496                              size, mmapDisposer);
    497     return heap<WritableFileMappingImpl>(kj::mv(array));
    498   }
    499 
  size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const {
    // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for
    // the OS. Only returns less than `size` if EOF. Does not account for holes.
    //
    // `offset` is the destination position in this file; `fromOffset` and `size` describe the
    // source range in `fromFd`. Returns the number of bytes copied.

#if __linux__
    {
      // sendfile() writes at the destination fd's current position, so seek there first. The
      // source position is tracked in `fromPos`, which sendfile() advances as it copies.
      KJ_SYSCALL(lseek(fd, offset, SEEK_SET));
      off_t fromPos = fromOffset;
      off_t end = fromOffset + size;
      while (fromPos < end) {
        ssize_t n;
        KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) {
          case EINVAL:
          case ENOSYS:
            // sendfile() is not usable here; redo the copy with the read/write loop below.
            goto sendfileNotAvailable;
          default:
            KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; }
        }
        if (n == 0) break;   // source hit EOF
      }
      return fromPos - fromOffset;
    }

  sendfileNotAvailable:
#endif
    // Portable fallback: shuttle the data through a 4k stack buffer with pread() + write().
    uint64_t total = 0;
    while (size > 0) {
      byte buffer[4096];
      ssize_t n;
      KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset));
      if (n == 0) break;   // source hit EOF
      write(offset, arrayPtr(buffer, n));
      fromOffset += n;
      offset += n;
      total += n;
      size -= n;
    }
    return total;
  }
    539 
    540   kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from,
    541                          uint64_t fromOffset, uint64_t size) const {
    542     KJ_IF_MAYBE(otherFd, from.getFd()) {
    543 #ifdef FICLONE
    544       if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) {
    545         if (ioctl(fd, FICLONE, *otherFd) >= 0) {
    546           return stat().size;
    547         }
    548       } else if (size > 0) {    // src_length = 0 has special meaning for the syscall, so avoid.
    549         struct file_clone_range range;
    550         memset(&range, 0, sizeof(range));
    551         range.src_fd = *otherFd;
    552         range.dest_offset = offset;
    553         range.src_offset = fromOffset;
    554         range.src_length = size == kj::maxValue ? 0 : size;
    555         if (ioctl(fd, FICLONERANGE, &range) >= 0) {
    556           // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs
    557           //   don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the
    558           //   rest. Where does the destination file's EOF marker end up? Who knows?
    559           return kj::min(from.stat().size - fromOffset, size);
    560         }
    561       } else {
    562         // size == 0
    563         return size_t(0);
    564       }
    565 
    566       // ioctl failed. Almost all failures documented for these are of the form "the operation is
    567       // not supported for the filesystem(s) specified", so fall back to other approaches.
    568 #endif
    569 
    570       off_t toPos = offset;
    571       off_t fromPos = fromOffset;
    572       off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size);
    573 
    574       for (;;) {
    575         // Handle data.
    576         {
    577           // Find out how much data there is before the next hole.
    578           off_t nextHole;
    579 #ifdef SEEK_HOLE
    580           KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) {
    581             case EINVAL:
    582               // SEEK_HOLE probably not supported. Assume no holes.
    583               nextHole = end;
    584               break;
    585             case ENXIO:
    586               // Past EOF. Stop here.
    587               return fromPos - fromOffset;
    588             default:
    589               KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
    590           }
    591 #else
    592           // SEEK_HOLE not supported. Assume no holes.
    593           nextHole = end;
    594 #endif
    595 
    596           // Copy the next chunk of data.
    597           off_t copyTo = kj::min(end, nextHole);
    598           size_t amount = copyTo - fromPos;
    599           if (amount > 0) {
    600             size_t n = copyChunk(toPos, *otherFd, fromPos, amount);
    601             fromPos += n;
    602             toPos += n;
    603 
    604             if (n < amount) {
    605               return fromPos - fromOffset;
    606             }
    607           }
    608 
    609           if (fromPos == end) {
    610             return fromPos - fromOffset;
    611           }
    612         }
    613 
    614 #ifdef SEEK_HOLE
    615         // Handle hole.
    616         {
    617           // Find out how much hole there is before the next data.
    618           off_t nextData;
    619           KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) {
    620             case EINVAL:
    621               // SEEK_DATA probably not supported. But we should only have gotten here if we
    622               // were expecting a hole.
    623               KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported");
    624               break;
    625             case ENXIO:
    626               // No more data. Set to EOF.
    627               KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END));
    628               if (nextData > end) {
    629                 end = nextData;
    630               }
    631               break;
    632             default:
    633               KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
    634           }
    635 
    636           // Write zeros.
    637           off_t zeroTo = kj::min(end, nextData);
    638           off_t amount = zeroTo - fromPos;
    639           if (amount > 0) {
    640             zero(toPos, amount);
    641             toPos += amount;
    642             fromPos = zeroTo;
    643           }
    644 
    645           if (fromPos == end) {
    646             return fromPos - fromOffset;
    647           }
    648         }
    649 #endif
    650       }
    651     }
    652 
    653     // Indicates caller should call File::copy() default implementation.
    654     return nullptr;
    655   }
    656 
    657   // ReadableDirectory ---------------------------------------------------------
    658 
    659   template <typename Func>
    660   auto list(bool needTypes, Func&& func) const
    661       -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> {
    662     // Seek to start of directory.
    663     KJ_SYSCALL(lseek(fd, 0, SEEK_SET));
    664 
    665     // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to
    666     // make a duplicate.
    667     int duped;
    668     KJ_SYSCALL(duped = dup(fd));
    669     DIR* dir = fdopendir(duped);
    670     if (dir == nullptr) {
    671       close(duped);
    672       KJ_FAIL_SYSCALL("fdopendir", errno);
    673     }
    674 
    675     KJ_DEFER(closedir(dir));
    676     typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry;
    677     kj::Vector<Entry> entries;
    678 
    679     for (;;) {
    680       errno = 0;
    681       struct dirent* entry = readdir(dir);
    682       if (entry == nullptr) {
    683         int error = errno;
    684         if (error == 0) {
    685           break;
    686         } else {
    687           KJ_FAIL_SYSCALL("readdir", error);
    688         }
    689       }
    690 
    691       kj::StringPtr name = entry->d_name;
    692       if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) {
    693 #ifdef DT_UNKNOWN    // d_type is not available on all platforms.
    694         if (entry->d_type != DT_UNKNOWN) {
    695           entries.add(func(name, modeToType(DTTOIF(entry->d_type))));
    696         } else {
    697 #endif
    698           if (needTypes) {
    699             // Unknown type. Fall back to stat.
    700             struct stat stats;
    701             KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW));
    702             entries.add(func(name, modeToType(stats.st_mode)));
    703           } else {
    704             entries.add(func(name, FsNode::Type::OTHER));
    705           }
    706 #ifdef DT_UNKNOWN
    707         }
    708 #endif
    709       }
    710     }
    711 
    712     auto result = entries.releaseAsArray();
    713     std::sort(result.begin(), result.end());
    714     return result;
    715   }
    716 
    717   Array<String> listNames() const {
    718     return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); });
    719   }
    720 
    721   Array<ReadableDirectory::Entry> listEntries() const {
    722     return list(true, [](StringPtr name, FsNode::Type type) {
    723       return ReadableDirectory::Entry { type, heapString(name), };
    724     });
    725   }
    726 
  bool exists(PathPtr path) const {
    // Reports whether `path` (relative to this directory) exists, using faccessat(F_OK).
    //
    // ENOENT and ENOTDIR both mean "does not exist". Any other error is raised via
    // KJ_FAIL_SYSCALL, with `false` as the recovery value.
    KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) {
      case ENOENT:
      case ENOTDIR:
        return false;
      default:
        KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; }
    }
    return true;
  }
    737 
    738   Maybe<FsNode::Metadata> tryLstat(PathPtr path) const {
    739     struct stat stats;
    740     KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
    741       case ENOENT:
    742       case ENOTDIR:
    743         return nullptr;
    744       default:
    745         KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; }
    746     }
    747     return statToMetadata(stats);
    748   }
    749 
  Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const {
    // Opens `path` (relative to this directory) read-only and wraps the descriptor with
    // newDiskReadableFile(). Returns nullptr if the path does not exist (ENOENT / ENOTDIR).
    //
    // Any other error is raised via KJ_FAIL_SYSCALL, with nullptr as the recovery value.
    int newFd;
    KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
        fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) {
      case ENOENT:
      case ENOTDIR:
        return nullptr;
      default:
        KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; }
    }

    kj::AutoCloseFd result(newFd);
#ifndef O_CLOEXEC
    // MAYBE_O_CLOEXEC was 0 above, so set close-on-exec after the fact.
    setCloexec(result);
#endif

    return newDiskReadableFile(kj::mv(result));
  }
    768 
  Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const {
    // Opens the directory at `path` (relative to this directory) read-only and returns its
    // descriptor, or nullptr if the path does not exist.
    //
    // If the path exists but is not a directory, the ENOTDIR failure is raised via
    // KJ_FAIL_SYSCALL (with nullptr as the recovery value), as are all other errors.
    int newFd;
    KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
        fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
      case ENOENT:
        return nullptr;
      case ENOTDIR:
        // Could mean that a parent is not a directory, which we treat as "doesn't exist".
        // Could also mean that the specified file is not a directory, which should throw.
        // Check using exists().
        if (!exists(path)) {
          return nullptr;
        }
        KJ_FALLTHROUGH;
      default:
        KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; }
    }

    kj::AutoCloseFd result(newFd);
#ifndef O_CLOEXEC
    // MAYBE_O_CLOEXEC was 0 above, so set close-on-exec after the fact.
    setCloexec(result);
#endif

    return kj::mv(result);
  }
    794 
    795   Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const {
    796     return tryOpenSubdirInternal(path).map(newDiskReadableDirectory);
    797   }
    798 
    799   Maybe<String> tryReadlink(PathPtr path) const {
    800     size_t trySize = 256;
    801     for (;;) {
    802       KJ_STACK_ARRAY(char, buf, trySize, 256, 4096);
    803       ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size());
    804       if (n < 0) {
    805         int error = errno;
    806         switch (error) {
    807           case EINTR:
    808             continue;
    809           case ENOENT:
    810           case ENOTDIR:
    811           case EINVAL:    // not a link
    812             return nullptr;
    813           default:
    814             KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; }
    815         }
    816       }
    817 
    818       if (n >= buf.size()) {
    819         // Didn't give it enough space. Better retry with a bigger buffer.
    820         trySize *= 2;
    821         continue;
    822       }
    823 
    824       return heapString(buf.begin(), n);
    825     }
    826   }
    827 
    828   // Directory -----------------------------------------------------------------
    829 
    830   bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const {
    831     // Internal function to make a directory.
    832 
    833     auto filename = path.toString();
    834     mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;
    835 
    836     KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) {
    837       case EEXIST: {
    838         // Apparently this path exists.
    839         if (!has(mode, WriteMode::MODIFY)) {
    840           // Require exclusive create.
    841           return false;
    842         }
    843 
    844         // MODIFY is allowed, so we just need to check whether the existing entry is a directory.
    845         struct stat stats;
    846         KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) {
    847           default:
    848             // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe
    849             // we can't access it for some reason. Assume failure.
    850             //
    851             // TODO(someday): Maybe we should be creating the directory at the target of the
    852             //   link?
    853             goto failed;
    854         }
    855         return (stats.st_mode & S_IFMT) == S_IFDIR;
    856       }
    857       case ENOENT:
    858         if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
    859             tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
    860                                     WriteMode::CREATE_PARENT, true)) {
    861           // Retry, but make sure we don't try to create the parent again.
    862           return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow);
    863         } else {
    864           goto failed;
    865         }
    866       default:
    867       failed:
    868         if (noThrow) {
    869           // Caller requested no throwing.
    870           return false;
    871         } else {
    872           KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path);
    873         }
    874     }
    875 
    876     return true;
    877   }
    878 
    879   kj::Maybe<String> createNamedTemporary(
    880       PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
    881     // Create a temporary file which will eventually replace `finalName`.
    882     //
    883     // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate()
    884     // is expected to behave like a syscall, returning a negative value and setting `errno` on
    885     // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in
    886     // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will
    887     // be called again with a new path.
    888     //
    889     // Returns the temporary path that succeeded. Only returns nullptr if there was an exception
    890     // but we're compiled with -fno-exceptions.
    891 
    892     if (finalName.size() == 0) {
    893       KJ_FAIL_REQUIRE("can't replace self") { break; }
    894       return nullptr;
    895     }
    896 
    897     static uint counter = 0;
    898     static const pid_t pid = getpid();
    899     String pathPrefix;
    900     if (finalName.size() > 1) {
    901       pathPrefix = kj::str(finalName.parent(), '/');
    902     }
    903     auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.',
    904                         finalName.basename()[0], ".partial");
    905 
    906     KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) {
    907       case EEXIST:
    908         return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
    909       case ENOENT:
    910         if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 &&
    911             tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY |
    912                                          WriteMode::CREATE_PARENT, true)) {
    913           // Retry, but make sure we don't try to create the parent again.
    914           mode = mode - WriteMode::CREATE_PARENT;
    915           return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
    916         }
    917         KJ_FALLTHROUGH;
    918       default:
    919         KJ_FAIL_SYSCALL("create(path)", error, path) { break; }
    920         return nullptr;
    921     }
    922 
    923     return kj::mv(path);
    924   }
    925 
    926   bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
    927     // Replaces the given path with an object created by calling tryCreate().
    928     //
    929     // tryCreate() must behave like a syscall which creates the node at the path passed to it,
    930     // returning a negative value on error. If the path passed to tryCreate already exists, it
    931     // MUST fail with EEXIST.
    932     //
    933     // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a
    934     // temporary location and then rename()ing it into place.
    935 
    936     if (path.size() == 0) {
    937       KJ_FAIL_REQUIRE("can't replace self") { return false; }
    938     }
    939 
    940     auto filename = path.toString();
    941 
    942     if (has(mode, WriteMode::CREATE)) {
    943       // First try just cerating the node in-place.
    944       KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) {
    945         case EEXIST:
    946           // Target exists.
    947           if (has(mode, WriteMode::MODIFY)) {
    948             // Fall back to MODIFY path, below.
    949             break;
    950           } else {
    951             return false;
    952           }
    953         case ENOENT:
    954           if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
    955               tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
    956                                       WriteMode::CREATE_PARENT, true)) {
    957             // Retry, but make sure we don't try to create the parent again.
    958             return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate));
    959           }
    960           KJ_FALLTHROUGH;
    961         default:
    962           KJ_FAIL_SYSCALL("create(path)", error, path) { return false; }
    963       } else {
    964         // Success.
    965         return true;
    966       }
    967     }
    968 
    969     // Either we don't have CREATE mode or the target already exists. We need to perform a
    970     // replacement instead.
    971 
    972     KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) {
    973       if (tryCommitReplacement(filename, fd, *tempPath, mode)) {
    974         return true;
    975       } else {
    976         KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) {
    977           case ENOENT:
    978             // meh
    979             break;
    980           default:
    981             KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath);
    982         }
    983         return false;
    984       }
    985     } else {
    986       // threw, but exceptions are disabled
    987       return false;
    988     }
    989   }
    990 
    991   Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const {
    992     uint flags = O_RDWR | MAYBE_O_CLOEXEC;
    993     mode_t acl = 0666;
    994     if (has(mode, WriteMode::CREATE)) {
    995       flags |= O_CREAT;
    996     }
    997     if (!has(mode, WriteMode::MODIFY)) {
    998       if (!has(mode, WriteMode::CREATE)) {
    999         // Neither CREATE nor MODIFY -- impossible to satisfy preconditions.
   1000         return nullptr;
   1001       }
   1002       flags |= O_EXCL;
   1003     }
   1004     if (append) {
   1005       flags |= O_APPEND;
   1006     }
   1007     if (has(mode, WriteMode::EXECUTABLE)) {
   1008       acl = 0777;
   1009     }
   1010     if (has(mode, WriteMode::PRIVATE)) {
   1011       acl &= 0700;
   1012     }
   1013 
   1014     auto filename = path.toString();
   1015 
   1016     int newFd;
   1017     KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) {
   1018       case ENOENT:
   1019         if (has(mode, WriteMode::CREATE)) {
   1020           // Either:
   1021           // - The file is a broken symlink.
   1022           // - A parent directory didn't exist.
   1023           if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
   1024               tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
   1025                                       WriteMode::CREATE_PARENT, true)) {
   1026             // Retry, but make sure we don't try to create the parent again.
   1027             return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append);
   1028           }
   1029 
   1030           // Check for broken link.
   1031           if (!has(mode, WriteMode::MODIFY) &&
   1032               faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
   1033             // Yep. We treat this as already-exists, which means in CREATE-only mode this is a
   1034             // simple failure.
   1035             return nullptr;
   1036           }
   1037 
   1038           KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; }
   1039         } else {
   1040           // MODIFY-only mode. ENOENT = doesn't exist = return null.
   1041           return nullptr;
   1042         }
   1043       case ENOTDIR:
   1044         if (!has(mode, WriteMode::CREATE)) {
   1045           // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null.
   1046           return nullptr;
   1047         }
   1048         goto failed;
   1049       case EEXIST:
   1050         if (!has(mode, WriteMode::MODIFY)) {
   1051           // CREATE-only mode. EEXIST = already exists = return null.
   1052           return nullptr;
   1053         }
   1054         goto failed;
   1055       default:
   1056       failed:
   1057         KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; }
   1058     }
   1059 
   1060     kj::AutoCloseFd result(newFd);
   1061 #ifndef O_CLOEXEC
   1062     setCloexec(result);
   1063 #endif
   1064 
   1065     return kj::mv(result);
   1066   }
   1067 
   1068   bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode,
   1069                             int* errorReason = nullptr) const {
   1070     if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
   1071       // Always clobber. Try it.
   1072       KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) {
   1073         case EISDIR:
   1074         case ENOTDIR:
   1075         case ENOTEMPTY:
   1076         case EEXIST:
   1077           // Failed because target exists and due to the various weird quirks of rename(), it
   1078           // can't remove it for us. On Linux we can try an exchange instead. On others we have
   1079           // to move the target out of the way.
   1080           break;
   1081         default:
   1082           if (errorReason == nullptr) {
   1083             KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; }
   1084           } else {
   1085             *errorReason = error;
   1086             return false;
   1087           }
   1088       } else {
   1089         return true;
   1090       }
   1091     }
   1092 
   1093 #if __linux__ && defined(RENAME_EXCHANGE)
   1094     // Try to use Linux's renameat2() to atomically check preconditions and apply.
   1095 
   1096     if (has(mode, WriteMode::MODIFY)) {
   1097       // Use an exchange to implement modification.
   1098       //
   1099       // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY
   1100       // in which we determined above that there's a node of a different type blocking the
   1101       // exchange.
   1102 
   1103       KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
   1104           fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) {
   1105         case ENOSYS:  // Syscall not supported by kernel.
   1106         case EINVAL:  // Maybe we screwed up, or maybe the syscall is not supported by the
   1107                       // filesystem. Unfortunately, there's no way to tell, so assume the latter.
   1108                       // ZFS in particular apparently produces EINVAL.
   1109           break;  // fall back to traditional means
   1110         case ENOENT:
   1111           // Presumably because the target path doesn't exist.
   1112           if (has(mode, WriteMode::CREATE)) {
   1113             KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but "
   1114                 "renameat2(fromPath, toPath, EXCAHNGE) said it doest; concurrent modification?",
   1115                 fromPath, toPath) { return false; }
   1116           } else {
   1117             // Assume target doesn't exist.
   1118             return false;
   1119           }
   1120         default:
   1121           if (errorReason == nullptr) {
   1122             KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) {
   1123               return false;
   1124             }
   1125           } else {
   1126             *errorReason = error;
   1127             return false;
   1128           }
   1129       } else {
   1130         // Successful swap! Delete swapped-out content.
   1131         rmrf(fromDirFd, fromPath);
   1132         return true;
   1133       }
   1134     } else if (has(mode, WriteMode::CREATE)) {
   1135       KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
   1136           fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) {
   1137         case ENOSYS:  // Syscall not supported by kernel.
   1138         case EINVAL:  // Maybe we screwed up, or maybe the syscall is not supported by the
   1139                       // filesystem. Unfortunately, there's no way to tell, so assume the latter.
   1140                       // ZFS in particular apparently produces EINVAL.
   1141           break;  // fall back to traditional means
   1142         case EEXIST:
   1143           return false;
   1144         default:
   1145           if (errorReason == nullptr) {
   1146             KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) {
   1147               return false;
   1148             }
   1149           } else {
   1150             *errorReason = error;
   1151             return false;
   1152           }
   1153       } else {
   1154         return true;
   1155       }
   1156     }
   1157 #endif
   1158 
   1159     // We're unable to do what we wanted atomically. :(
   1160 
   1161     if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
   1162       // We failed to atomically delete the target previously. So now we need to do two calls in
   1163       // rapid succession to move the old file away then move the new one into place.
   1164 
   1165       // Find out what kind of file exists at the target path.
   1166       struct stat stats;
   1167       KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; }
   1168 
   1169       // Create a temporary location to move the existing object to. Note that rename() allows a
   1170       // non-directory to replace a non-directory, and allows a directory to replace an empty
   1171       // directory. So we have to create the right type.
   1172       Path toPathParsed = Path::parse(toPath);
   1173       String away;
   1174       KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE,
   1175           [&](StringPtr candidatePath) {
   1176         if (S_ISDIR(stats.st_mode)) {
   1177           return mkdirat(fd, candidatePath.cStr(), 0700);
   1178         } else {
   1179 #if __APPLE__ || __FreeBSD__
   1180           // - No mknodat() on OSX, gotta open() a file, ugh.
   1181           // - On a modern FreeBSD, mknodat() is reserved strictly for device nodes,
   1182           //   you cannot create a regular file using it (EINVAL).
   1183           int newFd = openat(fd, candidatePath.cStr(),
   1184                              O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700);
   1185           if (newFd >= 0) close(newFd);
   1186           return newFd;
   1187 #else
   1188           return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t());
   1189 #endif
   1190         }
   1191       })) {
   1192         away = kj::mv(*awayPath);
   1193       } else {
   1194         // Already threw.
   1195         return false;
   1196       }
   1197 
   1198       // OK, now move the target object to replace the thing we just created.
   1199       KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) {
   1200         // Something went wrong. Remove the thing we just created.
   1201         unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0);
   1202         return false;
   1203       }
   1204 
   1205       // Now move the source object to the target location.
   1206       KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) {
   1207         default:
   1208           // Try to put things back where they were. If this fails, though, then we have little
   1209           // choice but to leave things broken.
   1210           KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) {
   1211             default: break;
   1212           }
   1213 
   1214           if (errorReason == nullptr) {
   1215             KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
   1216               return false;
   1217             }
   1218           } else {
   1219             *errorReason = error;
   1220             return false;
   1221           }
   1222       }
   1223 
   1224       // OK, success. Delete the old content.
   1225       rmrf(fd, away);
   1226       return true;
   1227     } else {
   1228       // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the
   1229       // corresponding precondition (must-not-exist or must-exist, respectively) is held.
   1230       if (has(mode, WriteMode::CREATE)) {
   1231         struct stat stats;
   1232         KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
   1233           case ENOENT:
   1234           case ENOTDIR:
   1235             break;  // doesn't exist; continue
   1236           default:
   1237             KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
   1238         } else {
   1239           return false;  // already exists; fail
   1240         }
   1241       } else if (has(mode, WriteMode::MODIFY)) {
   1242         struct stat stats;
   1243         KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
   1244           case ENOENT:
   1245           case ENOTDIR:
   1246             return false;  // doesn't exist; fail
   1247           default:
   1248             KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
   1249         } else {
   1250           // already exists; continue
   1251         }
   1252       } else {
   1253         // Neither CREATE nor MODIFY.
   1254         return false;
   1255       }
   1256 
   1257       // Start over in create-and-modify mode.
   1258       return tryCommitReplacement(toPath, fromDirFd, fromPath,
   1259                                   WriteMode::CREATE | WriteMode::MODIFY,
   1260                                   errorReason);
   1261     }
   1262   }
   1263 
   1264   template <typename T>
   1265   class ReplacerImpl final: public Directory::Replacer<T> {
   1266   public:
   1267     ReplacerImpl(Own<const T>&& object, const DiskHandle& handle,
   1268                  String&& tempPath, String&& path, WriteMode mode)
   1269         : Directory::Replacer<T>(mode),
   1270           object(kj::mv(object)), handle(handle),
   1271           tempPath(kj::mv(tempPath)), path(kj::mv(path)) {}
   1272 
   1273     ~ReplacerImpl() noexcept(false) {
   1274       if (!committed) {
   1275         rmrf(handle.fd, tempPath);
   1276       }
   1277     }
   1278 
   1279     const T& get() override {
   1280       return *object;
   1281     }
   1282 
   1283     bool tryCommit() override {
   1284       KJ_ASSERT(!committed, "already committed") { return false; }
   1285       return committed = handle.tryCommitReplacement(path, handle.fd, tempPath,
   1286                                                      Directory::Replacer<T>::mode);
   1287     }
   1288 
   1289   private:
   1290     Own<const T> object;
   1291     const DiskHandle& handle;
   1292     String tempPath;
   1293     String path;
   1294     bool committed = false;  // true if *successfully* committed (in which case tempPath is gone)
   1295   };
   1296 
   1297   template <typename T>
   1298   class BrokenReplacer final: public Directory::Replacer<T> {
   1299     // For recovery path when exceptions are disabled.
   1300 
   1301   public:
   1302     BrokenReplacer(Own<const T> inner)
   1303         : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY),
   1304           inner(kj::mv(inner)) {}
   1305 
   1306     const T& get() override { return *inner; }
   1307     bool tryCommit() override { return false; }
   1308 
   1309   private:
   1310     Own<const T> inner;
   1311   };
   1312 
   1313   Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const {
   1314     return tryOpenFileInternal(path, mode, false).map(newDiskFile);
   1315   }
   1316 
   1317   Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const {
   1318     mode_t acl = 0666;
   1319     if (has(mode, WriteMode::EXECUTABLE)) {
   1320       acl = 0777;
   1321     }
   1322     if (has(mode, WriteMode::PRIVATE)) {
   1323       acl &= 0700;
   1324     }
   1325 
   1326     int newFd_;
   1327     KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
   1328         [&](StringPtr candidatePath) {
   1329       return newFd_ = openat(fd, candidatePath.cStr(),
   1330                              O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl);
   1331     })) {
   1332       AutoCloseFd newFd(newFd_);
   1333 #ifndef O_CLOEXEC
   1334       setCloexec(newFd);
   1335 #endif
   1336       return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp),
   1337                                       path.toString(), mode);
   1338     } else {
   1339       // threw, but exceptions are disabled
   1340       return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock()));
   1341     }
   1342   }
   1343 
   1344   Own<const File> createTemporary() const {
   1345     int newFd_;
   1346 
   1347 #if __linux__ && defined(O_TMPFILE)
   1348     // Use syscall() to work around glibc bug with O_TMPFILE:
   1349     //     https://sourceware.org/bugzilla/show_bug.cgi?id=17523
   1350     KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall(
   1351         SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) {
   1352       case EOPNOTSUPP:
   1353       case EINVAL:
   1354       case EISDIR:
   1355         // Maybe not supported by this kernel / filesystem. Fall back to below.
   1356         break;
   1357       default:
   1358         KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; }
   1359         break;
   1360     } else {
   1361       AutoCloseFd newFd(newFd_);
   1362 #ifndef O_CLOEXEC
   1363       setCloexec(newFd);
   1364 #endif
   1365       return newDiskFile(kj::mv(newFd));
   1366     }
   1367 #endif
   1368 
   1369     KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE,
   1370         [&](StringPtr path) {
   1371       return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600);
   1372     })) {
   1373       AutoCloseFd newFd(newFd_);
   1374 #ifndef O_CLOEXEC
   1375       setCloexec(newFd);
   1376 #endif
   1377       auto result = newDiskFile(kj::mv(newFd));
   1378       KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; }
   1379       return kj::mv(result);
   1380     } else {
   1381       // threw, but exceptions are disabled
   1382       return newInMemoryFile(nullClock());
   1383     }
   1384   }
   1385 
   1386   Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const {
   1387     return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile);
   1388   }
   1389 
   1390   Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const {
   1391     // Must create before open.
   1392     if (has(mode, WriteMode::CREATE)) {
   1393       if (!tryMkdir(path, mode, false)) return nullptr;
   1394     }
   1395 
   1396     return tryOpenSubdirInternal(path).map(newDiskDirectory);
   1397   }
   1398 
   1399   Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const {
   1400     mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;
   1401 
   1402     KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
   1403         [&](StringPtr candidatePath) {
   1404       return mkdirat(fd, candidatePath.cStr(), acl);
   1405     })) {
   1406       int subdirFd_;
   1407       KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat(
   1408           fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
   1409         default:
   1410           KJ_FAIL_SYSCALL("open(just-created-temporary)", error);
   1411           return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
   1412       }
   1413 
   1414       AutoCloseFd subdirFd(subdirFd_);
   1415 #ifndef O_CLOEXEC
   1416       setCloexec(subdirFd);
   1417 #endif
   1418       return heap<ReplacerImpl<Directory>>(
   1419           newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode);
   1420     } else {
   1421       // threw, but exceptions are disabled
   1422       return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
   1423     }
   1424   }
   1425 
   1426   bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const {
   1427     return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) {
   1428       return symlinkat(content.cStr(), fd, candidatePath.cStr());
   1429     });
   1430   }
   1431 
   1432   bool tryTransfer(PathPtr toPath, WriteMode toMode,
   1433                    const Directory& fromDirectory, PathPtr fromPath,
   1434                    TransferMode mode, const Directory& self) const {
   1435     KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; }
   1436 
   1437     if (mode == TransferMode::LINK) {
   1438       KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
   1439         // Other is a disk directory, so we can hopefully do an efficient move/link.
   1440         return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) {
   1441           return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0);
   1442         });
   1443       };
   1444     } else if (mode == TransferMode::MOVE) {
   1445       KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
   1446         KJ_ASSERT(mode == TransferMode::MOVE);
   1447 
   1448         int error = 0;
   1449         if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode,
   1450                                  &error)) {
   1451           return true;
   1452         } else switch (error) {
   1453           case 0:
   1454             // Plain old WriteMode precondition failure.
   1455             return false;
   1456           case EXDEV:
   1457             // Can't move between devices. Fall back to default implementation, which does
   1458             // copy/delete.
   1459             break;
   1460           case ENOENT:
   1461             // Either the destination directory doesn't exist or the source path doesn't exist.
   1462             // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating
   1463             // the parent directory. Otherwise, we don't actually need to distinguish between
   1464             // these two errors; just return false.
   1465             if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) &&
   1466                 toPath.size() > 0 && tryMkdir(toPath.parent(),
   1467                     WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) {
   1468               // Retry, but make sure we don't try to create the parent again.
   1469               return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT,
   1470                                  fromDirectory, fromPath, mode, self);
   1471             }
   1472             return false;
   1473           default:
   1474             KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
   1475               return false;
   1476             }
   1477         }
   1478       }
   1479     }
   1480 
   1481     // OK, we can't do anything efficient using the OS. Fall back to default implementation.
   1482     return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode);
   1483   }
   1484 
   1485   bool tryRemove(PathPtr path) const {
   1486     return rmrf(fd, path.toString());
   1487   }
   1488 
   1489 protected:
   1490   AutoCloseFd fd;
   1491 };
   1492 
   1493 #define FSNODE_METHODS(classname)                                   \
   1494   Maybe<int> getFd() const override { return DiskHandle::getFd(); } \
   1495                                                                     \
   1496   Own<const FsNode> cloneFsNode() const override {                  \
   1497     return heap<classname>(DiskHandle::clone());                    \
   1498   }                                                                 \
   1499                                                                     \
   1500   Metadata stat() const override { return DiskHandle::stat(); }     \
   1501   void sync() const override { DiskHandle::sync(); }                \
   1502   void datasync() const override { DiskHandle::datasync(); }
   1503 
   1504 class DiskReadableFile final: public ReadableFile, public DiskHandle {
   1505 public:
   1506   DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
   1507 
   1508   FSNODE_METHODS(DiskReadableFile);
   1509 
   1510   size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
   1511     return DiskHandle::read(offset, buffer);
   1512   }
   1513   Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
   1514     return DiskHandle::mmap(offset, size);
   1515   }
   1516   Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
   1517     return DiskHandle::mmapPrivate(offset, size);
   1518   }
   1519 };
   1520 
   1521 class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream {
   1522 public:
   1523   DiskAppendableFile(AutoCloseFd&& fd)
   1524       : DiskHandle(kj::mv(fd)),
   1525         FdOutputStream(DiskHandle::fd.get()) {}
   1526 
   1527   FSNODE_METHODS(DiskAppendableFile);
   1528 
   1529   void write(const void* buffer, size_t size) override {
   1530     FdOutputStream::write(buffer, size);
   1531   }
   1532   void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override {
   1533     FdOutputStream::write(pieces);
   1534   }
   1535 };
   1536 
   1537 class DiskFile final: public File, public DiskHandle {
   1538 public:
   1539   DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
   1540 
   1541   FSNODE_METHODS(DiskFile);
   1542 
   1543   size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
   1544     return DiskHandle::read(offset, buffer);
   1545   }
   1546   Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
   1547     return DiskHandle::mmap(offset, size);
   1548   }
   1549   Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
   1550     return DiskHandle::mmapPrivate(offset, size);
   1551   }
   1552 
   1553   void write(uint64_t offset, ArrayPtr<const byte> data) const override {
   1554     DiskHandle::write(offset, data);
   1555   }
   1556   void zero(uint64_t offset, uint64_t size) const override {
   1557     DiskHandle::zero(offset, size);
   1558   }
   1559   void truncate(uint64_t size) const override {
   1560     DiskHandle::truncate(size);
   1561   }
   1562   Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override {
   1563     return DiskHandle::mmapWritable(offset, size);
   1564   }
   1565   size_t copy(uint64_t offset, const ReadableFile& from,
   1566               uint64_t fromOffset, uint64_t size) const override {
   1567     KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) {
   1568       return *result;
   1569     } else {
   1570       return File::copy(offset, from, fromOffset, size);
   1571     }
   1572   }
   1573 };
   1574 
   1575 class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle {
   1576 public:
   1577   DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
   1578 
   1579   FSNODE_METHODS(DiskReadableDirectory);
   1580 
   1581   Array<String> listNames() const override { return DiskHandle::listNames(); }
   1582   Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
   1583   bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
   1584   Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
   1585     return DiskHandle::tryLstat(path);
   1586   }
   1587   Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
   1588     return DiskHandle::tryOpenFile(path);
   1589   }
   1590   Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
   1591     return DiskHandle::tryOpenSubdir(path);
   1592   }
   1593   Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
   1594 };
   1595 
   1596 class DiskDirectory final: public Directory, public DiskHandle {
   1597 public:
   1598   DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
   1599 
   1600   FSNODE_METHODS(DiskDirectory);
   1601 
   1602   Array<String> listNames() const override { return DiskHandle::listNames(); }
   1603   Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
   1604   bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
   1605   Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
   1606     return DiskHandle::tryLstat(path);
   1607   }
   1608   Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
   1609     return DiskHandle::tryOpenFile(path);
   1610   }
   1611   Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
   1612     return DiskHandle::tryOpenSubdir(path);
   1613   }
   1614   Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
   1615 
   1616   Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override {
   1617     return DiskHandle::tryOpenFile(path, mode);
   1618   }
   1619   Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override {
   1620     return DiskHandle::replaceFile(path, mode);
   1621   }
   1622   Own<const File> createTemporary() const override {
   1623     return DiskHandle::createTemporary();
   1624   }
   1625   Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override {
   1626     return DiskHandle::tryAppendFile(path, mode);
   1627   }
   1628   Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override {
   1629     return DiskHandle::tryOpenSubdir(path, mode);
   1630   }
   1631   Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override {
   1632     return DiskHandle::replaceSubdir(path, mode);
   1633   }
   1634   bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override {
   1635     return DiskHandle::trySymlink(linkpath, content, mode);
   1636   }
   1637   bool tryTransfer(PathPtr toPath, WriteMode toMode,
   1638                    const Directory& fromDirectory, PathPtr fromPath,
   1639                    TransferMode mode) const override {
   1640     return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this);
   1641   }
   1642   // tryTransferTo() not implemented because we have nothing special we can do.
   1643   bool tryRemove(PathPtr path) const override {
   1644     return DiskHandle::tryRemove(path);
   1645   }
   1646 };
   1647 
   1648 class DiskFilesystem final: public Filesystem {
   1649 public:
   1650   DiskFilesystem()
   1651       : root(openDir("/")),
   1652         current(openDir(".")),
   1653         currentPath(computeCurrentPath()) {}
   1654 
   1655   const Directory& getRoot() const override {
   1656     return root;
   1657   }
   1658 
   1659   const Directory& getCurrent() const override {
   1660     return current;
   1661   }
   1662 
   1663   PathPtr getCurrentPath() const override {
   1664     return currentPath;
   1665   }
   1666 
   1667 private:
   1668   DiskDirectory root;
   1669   DiskDirectory current;
   1670   Path currentPath;
   1671 
   1672   static AutoCloseFd openDir(const char* dir) {
   1673     int newFd;
   1674     KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY));
   1675     AutoCloseFd result(newFd);
   1676 #ifndef O_CLOEXEC
   1677     setCloexec(result);
   1678 #endif
   1679     return result;
   1680   }
   1681 
   1682   static Path computeCurrentPath() {
   1683     // If env var PWD is set and points to the current directory, use it. This captures the current
   1684     // path according to the user's shell, which may differ from the kernel's idea in the presence
   1685     // of symlinks.
   1686     const char* pwd = getenv("PWD");
   1687     if (pwd != nullptr) {
   1688       Path result = nullptr;
   1689       struct stat pwdStat, dotStat;
   1690       KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() {
   1691         KJ_ASSERT(pwd[0] == '/') { return; }
   1692         result = Path::parse(pwd + 1);
   1693         KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; }
   1694         KJ_SYSCALL(lstat(".", &dotStat)) { return; }
   1695       })) {
   1696         // failed, give up on PWD
   1697         KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e);
   1698       } else {
   1699         if (pwdStat.st_ino == dotStat.st_ino &&
   1700             pwdStat.st_dev == dotStat.st_dev) {
   1701           return kj::mv(result);
   1702         } else {
   1703           KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd);
   1704         }
   1705       }
   1706     }
   1707 
   1708     size_t size = 256;
   1709   retry:
   1710     KJ_STACK_ARRAY(char, buf, size, 256, 4096);
   1711     if (getcwd(buf.begin(), size) == nullptr) {
   1712       int error = errno;
   1713       if (error == ENAMETOOLONG) {
   1714         size *= 2;
   1715         goto retry;
   1716       } else {
   1717         KJ_FAIL_SYSCALL("getcwd()", error);
   1718       }
   1719     }
   1720 
   1721     StringPtr path = buf.begin();
   1722 
   1723     // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir
   1724     // of the root directory, which is possible via chroot() or mount namespaces.
   1725     KJ_ASSERT(!path.startsWith("(unreachable)"),
   1726         "working directory is not reachable from root", path);
   1727     KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path);
   1728 
   1729     return Path::parse(path.slice(1));
   1730   }
   1731 };
   1732 
   1733 } // namespace
   1734 
   1735 Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) {
   1736   return heap<DiskReadableFile>(kj::mv(fd));
   1737 }
   1738 Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) {
   1739   return heap<DiskAppendableFile>(kj::mv(fd));
   1740 }
   1741 Own<File> newDiskFile(kj::AutoCloseFd fd) {
   1742   return heap<DiskFile>(kj::mv(fd));
   1743 }
   1744 Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) {
   1745   return heap<DiskReadableDirectory>(kj::mv(fd));
   1746 }
   1747 Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) {
   1748   return heap<DiskDirectory>(kj::mv(fd));
   1749 }
   1750 
   1751 Own<Filesystem> newDiskFilesystem() {
   1752   return heap<DiskFilesystem>();
   1753 }
   1754 
   1755 } // namespace kj
   1756 
   1757 #endif  // !_WIN32