qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

passthrough_ll.c (121610B)


      1 /*
      2  * FUSE: Filesystem in Userspace
      3  * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
      4  *
      5  * This program can be distributed under the terms of the GNU GPLv2.
      6  * See the file COPYING.
      7  */
      8 
      9 /*
     10  *
     11  * This file system mirrors the existing file system hierarchy of the
     12  * system, starting at the root file system. This is implemented by
     13  * just "passing through" all requests to the corresponding user-space
     14  * libc functions. In contrast to passthrough.c and passthrough_fh.c,
     15  * this implementation uses the low-level API. Its performance should
     16  * be the least bad among the three, but many operations are not
     17  * implemented. In particular, it is not possible to remove files (or
     18  * directories) because the code necessary to defer actual removal
     19  * until the file is not opened anymore would make the example much
     20  * more complicated.
     21  *
     22  * When writeback caching is enabled (-o writeback mount option), it
     23  * is only possible to write to files for which the mounting user has
     24  * read permissions. This is because the writeback cache requires the
     25  * kernel to be able to issue read requests for all files (which the
     26  * passthrough filesystem cannot satisfy if it can't read the file in
     27  * the underlying filesystem).
     28  *
     29  * Compile with:
     30  *
     31  *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
     32  * passthrough_ll
     33  *
     34  * ## Source code ##
     35  * \include passthrough_ll.c
     36  */
     37 
     38 #include "qemu/osdep.h"
     39 #include "qemu/timer.h"
     40 #include "qemu-version.h"
     41 #include "qemu/help-texts.h"
     42 #include "fuse_virtio.h"
     43 #include "fuse_log.h"
     44 #include "fuse_lowlevel.h"
     45 #include "standard-headers/linux/fuse.h"
     46 #include <cap-ng.h>
     47 #include <dirent.h>
     48 #include <pthread.h>
     49 #include <sys/file.h>
     50 #include <sys/mount.h>
     51 #include <sys/prctl.h>
     52 #include <sys/resource.h>
     53 #include <sys/syscall.h>
     54 #include <sys/wait.h>
     55 #include <sys/xattr.h>
     56 #include <syslog.h>
     57 #include <grp.h>
     58 
     59 #include "qemu/cutils.h"
     60 #include "passthrough_helpers.h"
     61 #include "passthrough_seccomp.h"
     62 
/*
 * Keep track of inode posix locks for each owner.  Each entry pairs a lock
 * owner value with the file descriptor used for that owner's OFD locks.
 */
struct lo_inode_plock {
    uint64_t lock_owner;
    int fd; /* fd for OFD locks */
};
     68 
/*
 * One slot of a struct lo_map.
 *
 * While in_use is true the slot carries one of the object members of the
 * union (which one depends on the map the slot belongs to).  While in_use is
 * false the slot is on the map's free list and `freelist` holds the index of
 * the next free slot, or -1 at the end of the list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist; /* next free slot index, -1 terminates the list */
    };
    bool in_use; /* set on allocation/reservation, cleared on removal */
};
     78 
/*
 * Maps FUSE fh or ino values to internal objects.
 *
 * The key of an object is the index of its slot inside `elems`.  Unused
 * slots are chained through lo_map_elem.freelist starting at `freelist`.
 */
struct lo_map {
    struct lo_map_elem *elems; /* array of nelems slots, NULL when empty */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of the first free slot, -1 if none */
};
     85 
/*
 * Identity of a file: inode number, device number and mount ID.
 * NOTE(review): presumably the lookup key for lo_data.inodes -- confirm
 * against lo_find().
 */
struct lo_key {
    ino_t ino;
    dev_t dev;
    uint64_t mnt_id;
};
     91 
/*
 * In-memory representation of one inode known to the FUSE client.
 * Obtained with lo_inode() and released with lo_inode_put().
 */
struct lo_inode {
    /* File descriptor for the inode; closed when the last reference goes */
    int fd;

    /*
     * Atomic reference count for this object.  The nlookup field holds a
     * reference and releases it when nlookup reaches 0.
     */
    gint refcount;

    struct lo_key key;

    /*
     * This counter keeps the inode alive during the FUSE session.
     * Incremented when the FUSE inode number is sent in a reply
     * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc).  Decremented when an inode is
     * released by a FUSE_FORGET request.
     *
     * Note that this value is untrusted because the client can manipulate
     * it arbitrarily using FUSE_FORGET requests.
     *
     * Protected by lo->mutex.
     */
    uint64_t nlookup;

    fuse_ino_t fuse_ino;
    pthread_mutex_t plock_mutex;
    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */

    /* File type bits; tested with S_ISREG()/S_ISDIR() in lo_inode_open() */
    mode_t filetype;
};
    122 
/*
 * A set of process credentials: effective uid, effective gid and umask.
 * NOTE(review): presumably used to save/restore credentials around
 * operations performed on behalf of the client -- confirm against callers.
 */
struct lo_cred {
    uid_t euid;
    gid_t egid;
    mode_t umask;
};
    128 
/* Values of lo_data.cache, selected by the cache=none|auto|always options */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
    134 
/* Values of lo_data.sandbox, selected by the sandbox=namespace|chroot options */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
    139 
/*
 * One entry of lo_data.xattr_map_list.
 * NOTE(review): key/prepend/flags presumably describe one xattr name
 * mapping rule parsed from the xattrmap option -- confirm against the parser.
 */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
    145 
/*
 * Global state of the passthrough filesystem; reachable from a request via
 * lo_data(req).  Several of the option fields are filled in through the
 * lo_opts table.
 */
struct lo_data {
    pthread_mutex_t mutex;
    int sandbox; /* SANDBOX_NAMESPACE or SANDBOX_CHROOT */
    int debug;
    int writeback;
    int flock;
    int posix_lock;
    int xattr;
    char *xattrmap;
    /*
     * Name under which the client's security.capability xattr is stored when
     * xattr names are remapped; NULL when the name is not remapped (see
     * drop_security_capability()).
     */
    char *xattr_security_capability;
    char *source;
    char *modcaps;
    double timeout; /* passed to fuse_reply_attr() as the attribute timeout */
    int cache; /* CACHE_NONE, CACHE_AUTO or CACHE_ALWAYS */
    int timeout_set;
    int readdirplus_set;
    int readdirplus_clear;
    int allow_direct_io;
    int announce_submounts;
    bool use_statx;
    struct lo_inode root;
    GHashTable *inodes; /* protected by lo->mutex */
    struct lo_map ino_map; /* protected by lo->mutex */
    struct lo_map dirp_map; /* protected by lo->mutex */
    struct lo_map fd_map; /* protected by lo->mutex */
    XattrMapEntry *xattr_map_list;
    size_t xattr_map_nentries;

    /* An O_PATH file descriptor to /proc/self/fd/ */
    int proc_self_fd;
    /* An O_PATH file descriptor to /proc/self/task/ */
    int proc_self_task;
    /* user_*: value requested by option; the other: state chosen in lo_init() */
    int user_killpriv_v2, killpriv_v2;
    /* If set, virtiofsd is responsible for setting umask during creation */
    bool change_umask;
    /* user_*: value requested by option; the other: state chosen in lo_init() */
    int user_posix_acl, posix_acl;
    /* Keeps track if /proc/<pid>/attr/fscreate should be used or not */
    bool use_fscreate;
    int user_security_label;
};
    186 
/*
 * Command line option table.  Each entry gives an option template, the
 * offset of the struct lo_data field it stores into and the value to store.
 * NOTE(review): entries with a %-format presumably store the parsed argument
 * instead of the third member, per struct fuse_opt semantics -- confirm.
 */
static const struct fuse_opt lo_opts[] = {
    { "sandbox=namespace",
      offsetof(struct lo_data, sandbox),
      SANDBOX_NAMESPACE },
    { "sandbox=chroot",
      offsetof(struct lo_data, sandbox),
      SANDBOX_CHROOT },
    { "writeback", offsetof(struct lo_data, writeback), 1 },
    { "no_writeback", offsetof(struct lo_data, writeback), 0 },
    { "source=%s", offsetof(struct lo_data, source), 0 },
    { "flock", offsetof(struct lo_data, flock), 1 },
    { "no_flock", offsetof(struct lo_data, flock), 0 },
    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
    { "xattr", offsetof(struct lo_data, xattr), 1 },
    { "no_xattr", offsetof(struct lo_data, xattr), 0 },
    { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
    { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
    /* "timeout=" appears twice: once for the value, once to record it was set */
    { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
    { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
    { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
    { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
    { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
    /* readdirplus on/off are tracked in two separate flags, see lo_init() */
    { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
    { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
    { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
    { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
    { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
    { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
    { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
    { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
    { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
    { "security_label", offsetof(struct lo_data, user_security_label), 1 },
    { "no_security_label", offsetof(struct lo_data, user_security_label), 0 },
    FUSE_OPT_END
};
/* NOTE(review): presumably selects syslog as the log sink -- confirm */
static bool use_syslog = false;
static int current_log_level;
/* Forward declaration */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);

/*
 * Saved libcap-ng state shared between threads.  load_capng() restores it
 * into the calling thread and takes a fresh copy while holding `mutex`.
 */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* True once the current thread has loaded cap-ng state from cap.saved */
static __thread bool cap_loaded = 0;

/* Forward declarations */
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
                                uint64_t mnt_id);
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
                            char **out_name);
    239 
    240 #define FCHDIR_NOFAIL(fd) do {                         \
    241         int fchdir_res = fchdir(fd);                   \
    242         assert(fchdir_res == 0);                       \
    243     } while (0)
    244 
    245 static bool is_dot_or_dotdot(const char *name)
    246 {
    247     return name[0] == '.' &&
    248            (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
    249 }
    250 
    251 /* Is `path` a single path component that is not "." or ".."? */
    252 static bool is_safe_path_component(const char *path)
    253 {
    254     if (strchr(path, '/')) {
    255         return false;
    256     }
    257 
    258     return !is_dot_or_dotdot(path);
    259 }
    260 
    261 static bool is_empty(const char *name)
    262 {
    263     return name[0] == '\0';
    264 }
    265 
    266 static struct lo_data *lo_data(fuse_req_t req)
    267 {
    268     return (struct lo_data *)fuse_req_userdata(req);
    269 }
    270 
    271 /*
    272  * Tries to figure out if /proc/<pid>/attr/fscreate is usable or not. With
    273  * selinux=0, read from fscreate returns -EINVAL.
    274  *
    275  * TODO: Link with libselinux and use is_selinux_enabled() instead down
    276  * the line. It probably will be more reliable indicator.
    277  */
    278 static bool is_fscreate_usable(struct lo_data *lo)
    279 {
    280     char procname[64];
    281     int fscreate_fd;
    282     size_t bytes_read;
    283 
    284     sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
    285     fscreate_fd = openat(lo->proc_self_task, procname, O_RDWR);
    286     if (fscreate_fd == -1) {
    287         return false;
    288     }
    289 
    290     bytes_read = read(fscreate_fd, procname, 64);
    291     close(fscreate_fd);
    292     if (bytes_read == -1) {
    293         return false;
    294     }
    295     return true;
    296 }
    297 
    298 /* Helpers to set/reset fscreate */
    299 static int open_set_proc_fscreate(struct lo_data *lo, const void *ctx,
    300                                   size_t ctxlen, int *fd)
    301 {
    302     char procname[64];
    303     int fscreate_fd, err = 0;
    304     size_t written;
    305 
    306     sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
    307     fscreate_fd = openat(lo->proc_self_task, procname, O_WRONLY);
    308     err = fscreate_fd == -1 ? errno : 0;
    309     if (err) {
    310         return err;
    311     }
    312 
    313     written = write(fscreate_fd, ctx, ctxlen);
    314     err = written == -1 ? errno : 0;
    315     if (err) {
    316         goto out;
    317     }
    318 
    319     *fd = fscreate_fd;
    320     return 0;
    321 out:
    322     close(fscreate_fd);
    323     return err;
    324 }
    325 
    326 static void close_reset_proc_fscreate(int fd)
    327 {
    328     if ((write(fd, NULL, 0)) == -1) {
    329         fuse_log(FUSE_LOG_WARNING, "Failed to reset fscreate. err=%d\n", errno);
    330     }
    331     close(fd);
    332     return;
    333 }
    334 
    335 /*
    336  * Load capng's state from our saved state if the current thread
    337  * hadn't previously been loaded.
    338  * returns 0 on success
    339  */
    340 static int load_capng(void)
    341 {
    342     if (!cap_loaded) {
    343         pthread_mutex_lock(&cap.mutex);
    344         capng_restore_state(&cap.saved);
    345         /*
    346          * restore_state free's the saved copy
    347          * so make another.
    348          */
    349         cap.saved = capng_save_state();
    350         if (!cap.saved) {
    351             pthread_mutex_unlock(&cap.mutex);
    352             fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
    353             return -EINVAL;
    354         }
    355         pthread_mutex_unlock(&cap.mutex);
    356 
    357         /*
    358          * We want to use the loaded state for our pid,
    359          * not the original
    360          */
    361         capng_setpid(syscall(SYS_gettid));
    362         cap_loaded = true;
    363     }
    364     return 0;
    365 }
    366 
    367 /*
    368  * Helpers for dropping and regaining effective capabilities. Returns 0
    369  * on success, error otherwise
    370  */
    371 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
    372 {
    373     int cap, ret;
    374 
    375     cap = capng_name_to_capability(cap_name);
    376     if (cap < 0) {
    377         ret = errno;
    378         fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
    379                  cap_name, strerror(errno));
    380         goto out;
    381     }
    382 
    383     if (load_capng()) {
    384         ret = errno;
    385         fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
    386         goto out;
    387     }
    388 
    389     /* We dont have this capability in effective set already. */
    390     if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
    391         ret = 0;
    392         goto out;
    393     }
    394 
    395     if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
    396         ret = errno;
    397         fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
    398         goto out;
    399     }
    400 
    401     if (capng_apply(CAPNG_SELECT_CAPS)) {
    402         ret = errno;
    403         fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
    404         goto out;
    405     }
    406 
    407     ret = 0;
    408     if (cap_dropped) {
    409         *cap_dropped = true;
    410     }
    411 
    412 out:
    413     return ret;
    414 }
    415 
    416 static int gain_effective_cap(const char *cap_name)
    417 {
    418     int cap;
    419     int ret = 0;
    420 
    421     cap = capng_name_to_capability(cap_name);
    422     if (cap < 0) {
    423         ret = errno;
    424         fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
    425                  cap_name, strerror(errno));
    426         goto out;
    427     }
    428 
    429     if (load_capng()) {
    430         ret = errno;
    431         fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
    432         goto out;
    433     }
    434 
    435     if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
    436         ret = errno;
    437         fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
    438         goto out;
    439     }
    440 
    441     if (capng_apply(CAPNG_SELECT_CAPS)) {
    442         ret = errno;
    443         fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
    444         goto out;
    445     }
    446     ret = 0;
    447 
    448 out:
    449     return ret;
    450 }
    451 
    452 /*
    453  * The host kernel normally drops security.capability xattr's on
    454  * any write, however if we're remapping xattr names we need to drop
    455  * whatever the clients security.capability is actually stored as.
    456  */
    457 static int drop_security_capability(const struct lo_data *lo, int fd)
    458 {
    459     if (!lo->xattr_security_capability) {
    460         /* We didn't remap the name, let the host kernel do it */
    461         return 0;
    462     }
    463     if (!fremovexattr(fd, lo->xattr_security_capability)) {
    464         /* All good */
    465         return 0;
    466     }
    467 
    468     switch (errno) {
    469     case ENODATA:
    470         /* Attribute didn't exist, that's fine */
    471         return 0;
    472 
    473     case ENOTSUP:
    474         /* FS didn't support attribute anyway, also fine */
    475         return 0;
    476 
    477     default:
    478         /* Hmm other error */
    479         return errno;
    480     }
    481 }
    482 
    483 static void lo_map_init(struct lo_map *map)
    484 {
    485     map->elems = NULL;
    486     map->nelems = 0;
    487     map->freelist = -1;
    488 }
    489 
    490 static void lo_map_destroy(struct lo_map *map)
    491 {
    492     g_free(map->elems);
    493 }
    494 
    495 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
    496 {
    497     struct lo_map_elem *new_elems;
    498     size_t i;
    499 
    500     if (new_nelems <= map->nelems) {
    501         return 1;
    502     }
    503 
    504     new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0]));
    505     if (!new_elems) {
    506         return 0;
    507     }
    508 
    509     for (i = map->nelems; i < new_nelems; i++) {
    510         new_elems[i].freelist = i + 1;
    511         new_elems[i].in_use = false;
    512     }
    513     new_elems[new_nelems - 1].freelist = -1;
    514 
    515     map->elems = new_elems;
    516     map->freelist = map->nelems;
    517     map->nelems = new_nelems;
    518     return 1;
    519 }
    520 
    521 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
    522 {
    523     struct lo_map_elem *elem;
    524 
    525     if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
    526         return NULL;
    527     }
    528 
    529     elem = &map->elems[map->freelist];
    530     map->freelist = elem->freelist;
    531 
    532     elem->in_use = true;
    533 
    534     return elem;
    535 }
    536 
    537 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
    538 {
    539     ssize_t *prev;
    540 
    541     if (!lo_map_grow(map, key + 1)) {
    542         return NULL;
    543     }
    544 
    545     for (prev = &map->freelist; *prev != -1;
    546          prev = &map->elems[*prev].freelist) {
    547         if (*prev == key) {
    548             struct lo_map_elem *elem = &map->elems[key];
    549 
    550             *prev = elem->freelist;
    551             elem->in_use = true;
    552             return elem;
    553         }
    554     }
    555     return NULL;
    556 }
    557 
    558 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
    559 {
    560     if (key >= map->nelems) {
    561         return NULL;
    562     }
    563     if (!map->elems[key].in_use) {
    564         return NULL;
    565     }
    566     return &map->elems[key];
    567 }
    568 
    569 static void lo_map_remove(struct lo_map *map, size_t key)
    570 {
    571     struct lo_map_elem *elem;
    572 
    573     if (key >= map->nelems) {
    574         return;
    575     }
    576 
    577     elem = &map->elems[key];
    578     if (!elem->in_use) {
    579         return;
    580     }
    581 
    582     elem->in_use = false;
    583 
    584     elem->freelist = map->freelist;
    585     map->freelist = key;
    586 }
    587 
    588 /* Assumes lo->mutex is held */
    589 static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
    590 {
    591     struct lo_map_elem *elem;
    592 
    593     elem = lo_map_alloc_elem(&lo->fd_map);
    594     if (!elem) {
    595         return -1;
    596     }
    597 
    598     elem->fd = fd;
    599     return elem - lo->fd_map.elems;
    600 }
    601 
    602 /* Assumes lo->mutex is held */
    603 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
    604 {
    605     struct lo_map_elem *elem;
    606 
    607     elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
    608     if (!elem) {
    609         return -1;
    610     }
    611 
    612     elem->dirp = dirp;
    613     return elem - lo_data(req)->dirp_map.elems;
    614 }
    615 
    616 /* Assumes lo->mutex is held */
    617 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
    618 {
    619     struct lo_map_elem *elem;
    620 
    621     elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
    622     if (!elem) {
    623         return -1;
    624     }
    625 
    626     elem->inode = inode;
    627     return elem - lo_data(req)->ino_map.elems;
    628 }
    629 
    630 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
    631 {
    632     struct lo_inode *inode = *inodep;
    633 
    634     if (!inode) {
    635         return;
    636     }
    637 
    638     *inodep = NULL;
    639 
    640     if (g_atomic_int_dec_and_test(&inode->refcount)) {
    641         close(inode->fd);
    642         free(inode);
    643     }
    644 }
    645 
    646 /* Caller must release refcount using lo_inode_put() */
    647 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
    648 {
    649     struct lo_data *lo = lo_data(req);
    650     struct lo_map_elem *elem;
    651 
    652     pthread_mutex_lock(&lo->mutex);
    653     elem = lo_map_get(&lo->ino_map, ino);
    654     if (elem) {
    655         g_atomic_int_inc(&elem->inode->refcount);
    656     }
    657     pthread_mutex_unlock(&lo->mutex);
    658 
    659     if (!elem) {
    660         return NULL;
    661     }
    662 
    663     return elem->inode;
    664 }
    665 
    666 /*
    667  * TODO Remove this helper and force callers to hold an inode refcount until
    668  * they are done with the fd.  This will be done in a later patch to make
    669  * review easier.
    670  */
    671 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
    672 {
    673     struct lo_inode *inode = lo_inode(req, ino);
    674     int fd;
    675 
    676     if (!inode) {
    677         return -1;
    678     }
    679 
    680     fd = inode->fd;
    681     lo_inode_put(lo_data(req), &inode);
    682     return fd;
    683 }
    684 
    685 /*
    686  * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
    687  * regular file or a directory.
    688  *
    689  * Use this helper function instead of raw openat(2) to prevent security issues
    690  * when a malicious client opens special files such as block device nodes.
    691  * Symlink inodes are also rejected since symlinks must already have been
    692  * traversed on the client side.
    693  */
    694 static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
    695                          int open_flags)
    696 {
    697     g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
    698     int fd;
    699 
    700     if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
    701         return -EBADF;
    702     }
    703 
    704     /*
    705      * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
    706      * that the inode is not a special file but if an external process races
    707      * with us then symlinks are traversed here. It is not possible to escape
    708      * the shared directory since it is mounted as "/" though.
    709      */
    710     fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
    711     if (fd < 0) {
    712         return -errno;
    713     }
    714     return fd;
    715 }
    716 
    717 static void lo_init(void *userdata, struct fuse_conn_info *conn)
    718 {
    719     struct lo_data *lo = (struct lo_data *)userdata;
    720 
    721     if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
    722         conn->want |= FUSE_CAP_EXPORT_SUPPORT;
    723     }
    724 
    725     if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
    726         fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
    727         conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    728     }
    729     if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
    730         if (lo->flock) {
    731             fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
    732             conn->want |= FUSE_CAP_FLOCK_LOCKS;
    733         } else {
    734             fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
    735             conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
    736         }
    737     }
    738 
    739     if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
    740         if (lo->posix_lock) {
    741             fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
    742             conn->want |= FUSE_CAP_POSIX_LOCKS;
    743         } else {
    744             fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
    745             conn->want &= ~FUSE_CAP_POSIX_LOCKS;
    746         }
    747     }
    748 
    749     if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
    750         lo->readdirplus_clear) {
    751         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
    752         conn->want &= ~FUSE_CAP_READDIRPLUS;
    753     }
    754 
    755     if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
    756         fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
    757                  "does not support it\n");
    758         lo->announce_submounts = false;
    759     }
    760 
    761     if (lo->user_killpriv_v2 == 1) {
    762         /*
    763          * User explicitly asked for this option. Enable it unconditionally.
    764          * If connection does not have this capability, it should fail
    765          * in fuse_lowlevel.c
    766          */
    767         fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
    768         conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
    769         lo->killpriv_v2 = 1;
    770     } else {
    771         /*
    772          * Either user specified to disable killpriv_v2, or did not
    773          * specify anything. Disable killpriv_v2 in both the cases.
    774          */
    775         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
    776         conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
    777         lo->killpriv_v2 = 0;
    778     }
    779 
    780     if (lo->user_posix_acl == 1) {
    781         /*
    782          * User explicitly asked for this option. Enable it unconditionally.
    783          * If connection does not have this capability, print error message
    784          * now. It will fail later in fuse_lowlevel.c
    785          */
    786         if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
    787             !(conn->capable & FUSE_CAP_DONT_MASK) ||
    788             !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
    789             fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
    790                      " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
    791                      " or FUSE_SETXATTR_EXT capability.\n");
    792         } else {
    793             fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
    794         }
    795 
    796         conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
    797                       FUSE_CAP_SETXATTR_EXT;
    798         lo->change_umask = true;
    799         lo->posix_acl = true;
    800     } else {
    801         /* User either did not specify anything or wants it disabled */
    802         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
    803         conn->want &= ~FUSE_CAP_POSIX_ACL;
    804     }
    805 
    806     if (lo->user_security_label == 1) {
    807         if (!(conn->capable & FUSE_CAP_SECURITY_CTX)) {
    808             fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable security label."
    809                      " kernel does not support FUSE_SECURITY_CTX capability.\n");
    810         }
    811         conn->want |= FUSE_CAP_SECURITY_CTX;
    812     } else {
    813         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling security label\n");
    814         conn->want &= ~FUSE_CAP_SECURITY_CTX;
    815     }
    816 }
    817 
    818 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
    819                        struct fuse_file_info *fi)
    820 {
    821     int res;
    822     struct stat buf;
    823     struct lo_data *lo = lo_data(req);
    824 
    825     (void)fi;
    826 
    827     res =
    828         fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    829     if (res == -1) {
    830         return (void)fuse_reply_err(req, errno);
    831     }
    832 
    833     fuse_reply_attr(req, &buf, lo->timeout);
    834 }
    835 
    836 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
    837 {
    838     struct lo_data *lo = lo_data(req);
    839     struct lo_map_elem *elem;
    840 
    841     pthread_mutex_lock(&lo->mutex);
    842     elem = lo_map_get(&lo->fd_map, fi->fh);
    843     pthread_mutex_unlock(&lo->mutex);
    844 
    845     if (!elem) {
    846         return -1;
    847     }
    848 
    849     return elem->fd;
    850 }
    851 
    852 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
    853                        int valid, struct fuse_file_info *fi)
    854 {
    855     int saverr;
    856     char procname[64];
    857     struct lo_data *lo = lo_data(req);
    858     struct lo_inode *inode;
    859     int ifd;
    860     int res;
    861     int fd = -1;
    862 
    863     inode = lo_inode(req, ino);
    864     if (!inode) {
    865         fuse_reply_err(req, EBADF);
    866         return;
    867     }
    868 
    869     ifd = inode->fd;
    870 
    871     /* If fi->fh is invalid we'll report EBADF later */
    872     if (fi) {
    873         fd = lo_fi_fd(req, fi);
    874     }
    875 
    876     if (valid & FUSE_SET_ATTR_MODE) {
    877         if (fi) {
    878             res = fchmod(fd, attr->st_mode);
    879         } else {
    880             sprintf(procname, "%i", ifd);
    881             res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
    882         }
    883         if (res == -1) {
    884             saverr = errno;
    885             goto out_err;
    886         }
    887     }
    888     if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
    889         uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
    890         gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
    891 
    892         saverr = drop_security_capability(lo, ifd);
    893         if (saverr) {
    894             goto out_err;
    895         }
    896 
    897         res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    898         if (res == -1) {
    899             saverr = errno;
    900             goto out_err;
    901         }
    902     }
    903     if (valid & FUSE_SET_ATTR_SIZE) {
    904         int truncfd;
    905         bool kill_suidgid;
    906         bool cap_fsetid_dropped = false;
    907 
    908         kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
    909         if (fi) {
    910             truncfd = fd;
    911         } else {
    912             truncfd = lo_inode_open(lo, inode, O_RDWR);
    913             if (truncfd < 0) {
    914                 saverr = -truncfd;
    915                 goto out_err;
    916             }
    917         }
    918 
    919         saverr = drop_security_capability(lo, truncfd);
    920         if (saverr) {
    921             if (!fi) {
    922                 close(truncfd);
    923             }
    924             goto out_err;
    925         }
    926 
    927         if (kill_suidgid) {
    928             res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
    929             if (res != 0) {
    930                 saverr = res;
    931                 if (!fi) {
    932                     close(truncfd);
    933                 }
    934                 goto out_err;
    935             }
    936         }
    937 
    938         res = ftruncate(truncfd, attr->st_size);
    939         saverr = res == -1 ? errno : 0;
    940 
    941         if (cap_fsetid_dropped) {
    942             if (gain_effective_cap("FSETID")) {
    943                 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
    944             }
    945         }
    946         if (!fi) {
    947             close(truncfd);
    948         }
    949         if (res == -1) {
    950             goto out_err;
    951         }
    952     }
    953     if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
    954         struct timespec tv[2];
    955 
    956         tv[0].tv_sec = 0;
    957         tv[1].tv_sec = 0;
    958         tv[0].tv_nsec = UTIME_OMIT;
    959         tv[1].tv_nsec = UTIME_OMIT;
    960 
    961         if (valid & FUSE_SET_ATTR_ATIME_NOW) {
    962             tv[0].tv_nsec = UTIME_NOW;
    963         } else if (valid & FUSE_SET_ATTR_ATIME) {
    964             tv[0] = attr->st_atim;
    965         }
    966 
    967         if (valid & FUSE_SET_ATTR_MTIME_NOW) {
    968             tv[1].tv_nsec = UTIME_NOW;
    969         } else if (valid & FUSE_SET_ATTR_MTIME) {
    970             tv[1] = attr->st_mtim;
    971         }
    972 
    973         if (fi) {
    974             res = futimens(fd, tv);
    975         } else {
    976             sprintf(procname, "%i", inode->fd);
    977             res = utimensat(lo->proc_self_fd, procname, tv, 0);
    978         }
    979         if (res == -1) {
    980             saverr = errno;
    981             goto out_err;
    982         }
    983     }
    984     lo_inode_put(lo, &inode);
    985 
    986     return lo_getattr(req, ino, fi);
    987 
    988 out_err:
    989     lo_inode_put(lo, &inode);
    990     fuse_reply_err(req, saverr);
    991 }
    992 
    993 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
    994                                 uint64_t mnt_id)
    995 {
    996     struct lo_inode *p;
    997     struct lo_key key = {
    998         .ino = st->st_ino,
    999         .dev = st->st_dev,
   1000         .mnt_id = mnt_id,
   1001     };
   1002 
   1003     pthread_mutex_lock(&lo->mutex);
   1004     p = g_hash_table_lookup(lo->inodes, &key);
   1005     if (p) {
   1006         assert(p->nlookup > 0);
   1007         p->nlookup++;
   1008         g_atomic_int_inc(&p->refcount);
   1009     }
   1010     pthread_mutex_unlock(&lo->mutex);
   1011 
   1012     return p;
   1013 }
   1014 
   1015 /* value_destroy_func for posix_locks GHashTable */
   1016 static void posix_locks_value_destroy(gpointer data)
   1017 {
   1018     struct lo_inode_plock *plock = data;
   1019 
   1020     /*
   1021      * We had used open() for locks and had only one fd. So
   1022      * closing this fd should release all OFD locks.
   1023      */
   1024     close(plock->fd);
   1025     free(plock);
   1026 }
   1027 
   1028 static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
   1029                     struct stat *statbuf, int flags, uint64_t *mnt_id)
   1030 {
   1031     int res;
   1032 
   1033 #if defined(CONFIG_STATX) && defined(CONFIG_STATX_MNT_ID)
   1034     if (lo->use_statx) {
   1035         struct statx statxbuf;
   1036 
   1037         res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
   1038                     &statxbuf);
   1039         if (!res) {
   1040             memset(statbuf, 0, sizeof(*statbuf));
   1041             statbuf->st_dev = makedev(statxbuf.stx_dev_major,
   1042                                       statxbuf.stx_dev_minor);
   1043             statbuf->st_ino = statxbuf.stx_ino;
   1044             statbuf->st_mode = statxbuf.stx_mode;
   1045             statbuf->st_nlink = statxbuf.stx_nlink;
   1046             statbuf->st_uid = statxbuf.stx_uid;
   1047             statbuf->st_gid = statxbuf.stx_gid;
   1048             statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
   1049                                        statxbuf.stx_rdev_minor);
   1050             statbuf->st_size = statxbuf.stx_size;
   1051             statbuf->st_blksize = statxbuf.stx_blksize;
   1052             statbuf->st_blocks = statxbuf.stx_blocks;
   1053             statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
   1054             statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
   1055             statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
   1056             statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
   1057             statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
   1058             statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;
   1059 
   1060             if (statxbuf.stx_mask & STATX_MNT_ID) {
   1061                 *mnt_id = statxbuf.stx_mnt_id;
   1062             } else {
   1063                 *mnt_id = 0;
   1064             }
   1065             return 0;
   1066         } else if (errno != ENOSYS) {
   1067             return -1;
   1068         }
   1069         lo->use_statx = false;
   1070         /* fallback */
   1071     }
   1072 #endif
   1073     res = fstatat(dirfd, pathname, statbuf, flags);
   1074     if (res == -1) {
   1075         return -1;
   1076     }
   1077     *mnt_id = 0;
   1078 
   1079     return 0;
   1080 }
   1081 
   1082 /*
   1083  * Increments nlookup on the inode on success. unref_inode_lolocked() must be
   1084  * called eventually to decrement nlookup again. If inodep is non-NULL, the
   1085  * inode pointer is stored and the caller must call lo_inode_put().
   1086  */
   1087 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
   1088                         struct fuse_entry_param *e,
   1089                         struct lo_inode **inodep)
   1090 {
   1091     int newfd;
   1092     int res;
   1093     int saverr;
   1094     uint64_t mnt_id;
   1095     struct lo_data *lo = lo_data(req);
   1096     struct lo_inode *inode = NULL;
   1097     struct lo_inode *dir = lo_inode(req, parent);
   1098 
   1099     if (inodep) {
   1100         *inodep = NULL; /* in case there is an error */
   1101     }
   1102 
   1103     /*
   1104      * name_to_handle_at() and open_by_handle_at() can reach here with fuse
   1105      * mount point in guest, but we don't have its inode info in the
   1106      * ino_map.
   1107      */
   1108     if (!dir) {
   1109         return ENOENT;
   1110     }
   1111 
   1112     memset(e, 0, sizeof(*e));
   1113     e->attr_timeout = lo->timeout;
   1114     e->entry_timeout = lo->timeout;
   1115 
   1116     /* Do not allow escaping root directory */
   1117     if (dir == &lo->root && strcmp(name, "..") == 0) {
   1118         name = ".";
   1119     }
   1120 
   1121     newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
   1122     if (newfd == -1) {
   1123         goto out_err;
   1124     }
   1125 
   1126     res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
   1127                    &mnt_id);
   1128     if (res == -1) {
   1129         goto out_err;
   1130     }
   1131 
   1132     if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
   1133         (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
   1134         e->attr_flags |= FUSE_ATTR_SUBMOUNT;
   1135     }
   1136 
   1137     inode = lo_find(lo, &e->attr, mnt_id);
   1138     if (inode) {
   1139         close(newfd);
   1140     } else {
   1141         inode = calloc(1, sizeof(struct lo_inode));
   1142         if (!inode) {
   1143             goto out_err;
   1144         }
   1145 
   1146         /* cache only filetype */
   1147         inode->filetype = (e->attr.st_mode & S_IFMT);
   1148 
   1149         /*
   1150          * One for the caller and one for nlookup (released in
   1151          * unref_inode_lolocked())
   1152          */
   1153         g_atomic_int_set(&inode->refcount, 2);
   1154 
   1155         inode->nlookup = 1;
   1156         inode->fd = newfd;
   1157         inode->key.ino = e->attr.st_ino;
   1158         inode->key.dev = e->attr.st_dev;
   1159         inode->key.mnt_id = mnt_id;
   1160         if (lo->posix_lock) {
   1161             pthread_mutex_init(&inode->plock_mutex, NULL);
   1162             inode->posix_locks = g_hash_table_new_full(
   1163                 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
   1164         }
   1165         pthread_mutex_lock(&lo->mutex);
   1166         inode->fuse_ino = lo_add_inode_mapping(req, inode);
   1167         g_hash_table_insert(lo->inodes, &inode->key, inode);
   1168         pthread_mutex_unlock(&lo->mutex);
   1169     }
   1170     e->ino = inode->fuse_ino;
   1171 
   1172     /* Transfer ownership of inode pointer to caller or drop it */
   1173     if (inodep) {
   1174         *inodep = inode;
   1175     } else {
   1176         lo_inode_put(lo, &inode);
   1177     }
   1178 
   1179     lo_inode_put(lo, &dir);
   1180 
   1181     fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
   1182              name, (unsigned long long)e->ino);
   1183 
   1184     return 0;
   1185 
   1186 out_err:
   1187     saverr = errno;
   1188     if (newfd != -1) {
   1189         close(newfd);
   1190     }
   1191     lo_inode_put(lo, &inode);
   1192     lo_inode_put(lo, &dir);
   1193     return saverr;
   1194 }
   1195 
   1196 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
   1197 {
   1198     struct fuse_entry_param e;
   1199     int err;
   1200 
   1201     fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
   1202              name);
   1203 
   1204     if (is_empty(name)) {
   1205         fuse_reply_err(req, ENOENT);
   1206         return;
   1207     }
   1208 
   1209     /*
   1210      * Don't use is_safe_path_component(), allow "." and ".." for NFS export
   1211      * support.
   1212      */
   1213     if (strchr(name, '/')) {
   1214         fuse_reply_err(req, EINVAL);
   1215         return;
   1216     }
   1217 
   1218     err = lo_do_lookup(req, parent, name, &e, NULL);
   1219     if (err) {
   1220         fuse_reply_err(req, err);
   1221     } else {
   1222         fuse_reply_entry(req, &e);
   1223     }
   1224 }
   1225 
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid / OURSYS_setresuid expand to the *32 syscall number when
 * the system headers define one, and to the plain syscall number otherwise.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
   1242 
   1243 static void drop_supplementary_groups(void)
   1244 {
   1245     int ret;
   1246 
   1247     ret = getgroups(0, NULL);
   1248     if (ret == -1) {
   1249         fuse_log(FUSE_LOG_ERR, "getgroups() failed with error=%d:%s\n",
   1250                  errno, strerror(errno));
   1251         exit(1);
   1252     }
   1253 
   1254     if (!ret) {
   1255         return;
   1256     }
   1257 
   1258     /* Drop all supplementary groups. We should not need it */
   1259     ret = setgroups(0, NULL);
   1260     if (ret == -1) {
   1261         fuse_log(FUSE_LOG_ERR, "setgroups() failed with error=%d:%s\n",
   1262                  errno, strerror(errno));
   1263         exit(1);
   1264     }
   1265 }
   1266 
   1267 /*
   1268  * Change to uid/gid of caller so that file is created with
   1269  * ownership of caller.
   1270  * TODO: What about selinux context?
   1271  */
   1272 static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
   1273                           bool change_umask)
   1274 {
   1275     int res;
   1276 
   1277     old->euid = geteuid();
   1278     old->egid = getegid();
   1279 
   1280     res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
   1281     if (res == -1) {
   1282         return errno;
   1283     }
   1284 
   1285     res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
   1286     if (res == -1) {
   1287         int errno_save = errno;
   1288 
   1289         syscall(OURSYS_setresgid, -1, old->egid, -1);
   1290         return errno_save;
   1291     }
   1292 
   1293     if (change_umask) {
   1294         old->umask = umask(req->ctx.umask);
   1295     }
   1296     return 0;
   1297 }
   1298 
   1299 /* Regain Privileges */
   1300 static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
   1301 {
   1302     int res;
   1303 
   1304     res = syscall(OURSYS_setresuid, -1, old->euid, -1);
   1305     if (res == -1) {
   1306         fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
   1307         exit(1);
   1308     }
   1309 
   1310     res = syscall(OURSYS_setresgid, -1, old->egid, -1);
   1311     if (res == -1) {
   1312         fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
   1313         exit(1);
   1314     }
   1315 
   1316     if (restore_umask)
   1317         umask(old->umask);
   1318 }
   1319 
   1320 /*
   1321  * A helper to change cred and drop capability. Returns 0 on success and
   1322  * errno on error
   1323  */
   1324 static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
   1325                                    bool change_umask, const char *cap_name,
   1326                                    bool *cap_dropped)
   1327 {
   1328     int ret;
   1329     bool __cap_dropped;
   1330 
   1331     assert(cap_name);
   1332 
   1333     ret = drop_effective_cap(cap_name, &__cap_dropped);
   1334     if (ret) {
   1335         return ret;
   1336     }
   1337 
   1338     ret = lo_change_cred(req, old, change_umask);
   1339     if (ret) {
   1340         if (__cap_dropped) {
   1341             if (gain_effective_cap(cap_name)) {
   1342                 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
   1343             }
   1344         }
   1345     }
   1346 
   1347     if (cap_dropped) {
   1348         *cap_dropped = __cap_dropped;
   1349     }
   1350     return ret;
   1351 }
   1352 
   1353 static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
   1354                                      const char *cap_name)
   1355 {
   1356     assert(cap_name);
   1357 
   1358     lo_restore_cred(old, restore_umask);
   1359 
   1360     if (gain_effective_cap(cap_name)) {
   1361         fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
   1362     }
   1363 }
   1364 
   1365 static int do_mknod_symlink_secctx(fuse_req_t req, struct lo_inode *dir,
   1366                                    const char *name, const char *secctx_name)
   1367 {
   1368     int path_fd, err;
   1369     char procname[64];
   1370     struct lo_data *lo = lo_data(req);
   1371 
   1372     if (!req->secctx.ctxlen) {
   1373         return 0;
   1374     }
   1375 
   1376     /* Open newly created element with O_PATH */
   1377     path_fd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
   1378     err = path_fd == -1 ? errno : 0;
   1379     if (err) {
   1380         return err;
   1381     }
   1382     sprintf(procname, "%i", path_fd);
   1383     FCHDIR_NOFAIL(lo->proc_self_fd);
   1384     /* Set security context. This is not atomic w.r.t file creation */
   1385     err = setxattr(procname, secctx_name, req->secctx.ctx, req->secctx.ctxlen,
   1386                    0);
   1387     if (err) {
   1388         err = errno;
   1389     }
   1390     FCHDIR_NOFAIL(lo->root.fd);
   1391     close(path_fd);
   1392     return err;
   1393 }
   1394 
   1395 static int do_mknod_symlink(fuse_req_t req, struct lo_inode *dir,
   1396                             const char *name, mode_t mode, dev_t rdev,
   1397                             const char *link)
   1398 {
   1399     int err, fscreate_fd = -1;
   1400     const char *secctx_name = req->secctx.name;
   1401     struct lo_cred old = {};
   1402     struct lo_data *lo = lo_data(req);
   1403     char *mapped_name = NULL;
   1404     bool secctx_enabled = req->secctx.ctxlen;
   1405     bool do_fscreate = false;
   1406 
   1407     if (secctx_enabled && lo->xattrmap) {
   1408         err = xattr_map_client(lo, req->secctx.name, &mapped_name);
   1409         if (err < 0) {
   1410             return -err;
   1411         }
   1412         secctx_name = mapped_name;
   1413     }
   1414 
   1415     /*
   1416      * If security xattr has not been remapped and selinux is enabled on
   1417      * host, set fscreate and no need to do a setxattr() after file creation
   1418      */
   1419     if (secctx_enabled && !mapped_name && lo->use_fscreate) {
   1420         do_fscreate = true;
   1421         err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
   1422                                      &fscreate_fd);
   1423         if (err) {
   1424             goto out;
   1425         }
   1426     }
   1427 
   1428     err = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
   1429     if (err) {
   1430         goto out;
   1431     }
   1432 
   1433     err = mknod_wrapper(dir->fd, name, link, mode, rdev);
   1434     err = err == -1 ? errno : 0;
   1435     lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
   1436     if (err) {
   1437         goto out;
   1438     }
   1439 
   1440     if (!do_fscreate) {
   1441         err = do_mknod_symlink_secctx(req, dir, name, secctx_name);
   1442         if (err) {
   1443             unlinkat(dir->fd, name, S_ISDIR(mode) ? AT_REMOVEDIR : 0);
   1444         }
   1445     }
   1446 out:
   1447     if (fscreate_fd != -1) {
   1448         close_reset_proc_fscreate(fscreate_fd);
   1449     }
   1450     g_free(mapped_name);
   1451     return err;
   1452 }
   1453 
   1454 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
   1455                              const char *name, mode_t mode, dev_t rdev,
   1456                              const char *link)
   1457 {
   1458     int saverr;
   1459     struct lo_data *lo = lo_data(req);
   1460     struct lo_inode *dir;
   1461     struct fuse_entry_param e;
   1462 
   1463     if (is_empty(name)) {
   1464         fuse_reply_err(req, ENOENT);
   1465         return;
   1466     }
   1467 
   1468     if (!is_safe_path_component(name)) {
   1469         fuse_reply_err(req, EINVAL);
   1470         return;
   1471     }
   1472 
   1473     dir = lo_inode(req, parent);
   1474     if (!dir) {
   1475         fuse_reply_err(req, EBADF);
   1476         return;
   1477     }
   1478 
   1479     saverr = do_mknod_symlink(req, dir, name, mode, rdev, link);
   1480     if (saverr) {
   1481         goto out;
   1482     }
   1483 
   1484     saverr = lo_do_lookup(req, parent, name, &e, NULL);
   1485     if (saverr) {
   1486         goto out;
   1487     }
   1488 
   1489     fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
   1490              name, (unsigned long long)e.ino);
   1491 
   1492     fuse_reply_entry(req, &e);
   1493     lo_inode_put(lo, &dir);
   1494     return;
   1495 
   1496 out:
   1497     lo_inode_put(lo, &dir);
   1498     fuse_reply_err(req, saverr);
   1499 }
   1500 
   1501 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
   1502                      mode_t mode, dev_t rdev)
   1503 {
   1504     lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
   1505 }
   1506 
   1507 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
   1508                      mode_t mode)
   1509 {
   1510     lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
   1511 }
   1512 
   1513 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
   1514                        const char *name)
   1515 {
   1516     lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
   1517 }
   1518 
   1519 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
   1520                     const char *name)
   1521 {
   1522     int res;
   1523     struct lo_data *lo = lo_data(req);
   1524     struct lo_inode *parent_inode;
   1525     struct lo_inode *inode;
   1526     struct fuse_entry_param e;
   1527     char procname[64];
   1528     int saverr;
   1529 
   1530     if (is_empty(name)) {
   1531         fuse_reply_err(req, ENOENT);
   1532         return;
   1533     }
   1534 
   1535     if (!is_safe_path_component(name)) {
   1536         fuse_reply_err(req, EINVAL);
   1537         return;
   1538     }
   1539 
   1540     parent_inode = lo_inode(req, parent);
   1541     inode = lo_inode(req, ino);
   1542     if (!parent_inode || !inode) {
   1543         errno = EBADF;
   1544         goto out_err;
   1545     }
   1546 
   1547     memset(&e, 0, sizeof(struct fuse_entry_param));
   1548     e.attr_timeout = lo->timeout;
   1549     e.entry_timeout = lo->timeout;
   1550 
   1551     sprintf(procname, "%i", inode->fd);
   1552     res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
   1553                  AT_SYMLINK_FOLLOW);
   1554     if (res == -1) {
   1555         goto out_err;
   1556     }
   1557 
   1558     res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
   1559     if (res == -1) {
   1560         goto out_err;
   1561     }
   1562 
   1563     pthread_mutex_lock(&lo->mutex);
   1564     inode->nlookup++;
   1565     pthread_mutex_unlock(&lo->mutex);
   1566     e.ino = inode->fuse_ino;
   1567 
   1568     fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
   1569              name, (unsigned long long)e.ino);
   1570 
   1571     fuse_reply_entry(req, &e);
   1572     lo_inode_put(lo, &parent_inode);
   1573     lo_inode_put(lo, &inode);
   1574     return;
   1575 
   1576 out_err:
   1577     saverr = errno;
   1578     lo_inode_put(lo, &parent_inode);
   1579     lo_inode_put(lo, &inode);
   1580     fuse_reply_err(req, saverr);
   1581 }
   1582 
   1583 /* Increments nlookup and caller must release refcount using lo_inode_put() */
   1584 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
   1585                                     const char *name)
   1586 {
   1587     int res;
   1588     uint64_t mnt_id;
   1589     struct stat attr;
   1590     struct lo_data *lo = lo_data(req);
   1591     struct lo_inode *dir = lo_inode(req, parent);
   1592 
   1593     if (!dir) {
   1594         return NULL;
   1595     }
   1596 
   1597     res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id);
   1598     lo_inode_put(lo, &dir);
   1599     if (res == -1) {
   1600         return NULL;
   1601     }
   1602 
   1603     return lo_find(lo, &attr, mnt_id);
   1604 }
   1605 
   1606 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
   1607 {
   1608     int res;
   1609     struct lo_inode *inode;
   1610     struct lo_data *lo = lo_data(req);
   1611 
   1612     if (is_empty(name)) {
   1613         fuse_reply_err(req, ENOENT);
   1614         return;
   1615     }
   1616 
   1617     if (!is_safe_path_component(name)) {
   1618         fuse_reply_err(req, EINVAL);
   1619         return;
   1620     }
   1621 
   1622     inode = lookup_name(req, parent, name);
   1623     if (!inode) {
   1624         fuse_reply_err(req, EIO);
   1625         return;
   1626     }
   1627 
   1628     res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
   1629 
   1630     fuse_reply_err(req, res == -1 ? errno : 0);
   1631     unref_inode_lolocked(lo, inode, 1);
   1632     lo_inode_put(lo, &inode);
   1633 }
   1634 
   1635 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
   1636                       fuse_ino_t newparent, const char *newname,
   1637                       unsigned int flags)
   1638 {
   1639     int res;
   1640     struct lo_inode *parent_inode;
   1641     struct lo_inode *newparent_inode;
   1642     struct lo_inode *oldinode = NULL;
   1643     struct lo_inode *newinode = NULL;
   1644     struct lo_data *lo = lo_data(req);
   1645 
   1646     if (is_empty(name) || is_empty(newname)) {
   1647         fuse_reply_err(req, ENOENT);
   1648         return;
   1649     }
   1650 
   1651     if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
   1652         fuse_reply_err(req, EINVAL);
   1653         return;
   1654     }
   1655 
   1656     parent_inode = lo_inode(req, parent);
   1657     newparent_inode = lo_inode(req, newparent);
   1658     if (!parent_inode || !newparent_inode) {
   1659         fuse_reply_err(req, EBADF);
   1660         goto out;
   1661     }
   1662 
   1663     oldinode = lookup_name(req, parent, name);
   1664     newinode = lookup_name(req, newparent, newname);
   1665 
   1666     if (!oldinode) {
   1667         fuse_reply_err(req, EIO);
   1668         goto out;
   1669     }
   1670 
   1671     if (flags) {
   1672 #ifndef SYS_renameat2
   1673         fuse_reply_err(req, EINVAL);
   1674 #else
   1675         res = syscall(SYS_renameat2, parent_inode->fd, name,
   1676                         newparent_inode->fd, newname, flags);
   1677         if (res == -1 && errno == ENOSYS) {
   1678             fuse_reply_err(req, EINVAL);
   1679         } else {
   1680             fuse_reply_err(req, res == -1 ? errno : 0);
   1681         }
   1682 #endif
   1683         goto out;
   1684     }
   1685 
   1686     res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
   1687 
   1688     fuse_reply_err(req, res == -1 ? errno : 0);
   1689 out:
   1690     unref_inode_lolocked(lo, oldinode, 1);
   1691     unref_inode_lolocked(lo, newinode, 1);
   1692     lo_inode_put(lo, &oldinode);
   1693     lo_inode_put(lo, &newinode);
   1694     lo_inode_put(lo, &parent_inode);
   1695     lo_inode_put(lo, &newparent_inode);
   1696 }
   1697 
   1698 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
   1699 {
   1700     int res;
   1701     struct lo_inode *inode;
   1702     struct lo_data *lo = lo_data(req);
   1703 
   1704     if (is_empty(name)) {
   1705         fuse_reply_err(req, ENOENT);
   1706         return;
   1707     }
   1708 
   1709     if (!is_safe_path_component(name)) {
   1710         fuse_reply_err(req, EINVAL);
   1711         return;
   1712     }
   1713 
   1714     inode = lookup_name(req, parent, name);
   1715     if (!inode) {
   1716         fuse_reply_err(req, EIO);
   1717         return;
   1718     }
   1719 
   1720     res = unlinkat(lo_fd(req, parent), name, 0);
   1721 
   1722     fuse_reply_err(req, res == -1 ? errno : 0);
   1723     unref_inode_lolocked(lo, inode, 1);
   1724     lo_inode_put(lo, &inode);
   1725 }
   1726 
   1727 /* To be called with lo->mutex held */
   1728 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
   1729 {
   1730     if (!inode) {
   1731         return;
   1732     }
   1733 
   1734     assert(inode->nlookup >= n);
   1735     inode->nlookup -= n;
   1736     if (!inode->nlookup) {
   1737         lo_map_remove(&lo->ino_map, inode->fuse_ino);
   1738         g_hash_table_remove(lo->inodes, &inode->key);
   1739         if (lo->posix_lock) {
   1740             if (g_hash_table_size(inode->posix_locks)) {
   1741                 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
   1742             }
   1743             g_hash_table_destroy(inode->posix_locks);
   1744             pthread_mutex_destroy(&inode->plock_mutex);
   1745         }
   1746         /* Drop our refcount from lo_do_lookup() */
   1747         lo_inode_put(lo, &inode);
   1748     }
   1749 }
   1750 
   1751 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
   1752                                  uint64_t n)
   1753 {
   1754     if (!inode) {
   1755         return;
   1756     }
   1757 
   1758     pthread_mutex_lock(&lo->mutex);
   1759     unref_inode(lo, inode, n);
   1760     pthread_mutex_unlock(&lo->mutex);
   1761 }
   1762 
   1763 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
   1764 {
   1765     struct lo_data *lo = lo_data(req);
   1766     struct lo_inode *inode;
   1767 
   1768     inode = lo_inode(req, ino);
   1769     if (!inode) {
   1770         return;
   1771     }
   1772 
   1773     fuse_log(FUSE_LOG_DEBUG, "  forget %lli %lli -%lli\n",
   1774              (unsigned long long)ino, (unsigned long long)inode->nlookup,
   1775              (unsigned long long)nlookup);
   1776 
   1777     unref_inode_lolocked(lo, inode, nlookup);
   1778     lo_inode_put(lo, &inode);
   1779 }
   1780 
   1781 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
   1782 {
   1783     lo_forget_one(req, ino, nlookup);
   1784     fuse_reply_none(req);
   1785 }
   1786 
   1787 static void lo_forget_multi(fuse_req_t req, size_t count,
   1788                             struct fuse_forget_data *forgets)
   1789 {
   1790     int i;
   1791 
   1792     for (i = 0; i < count; i++) {
   1793         lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
   1794     }
   1795     fuse_reply_none(req);
   1796 }
   1797 
   1798 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
   1799 {
   1800     char buf[PATH_MAX + 1];
   1801     int res;
   1802 
   1803     res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
   1804     if (res == -1) {
   1805         return (void)fuse_reply_err(req, errno);
   1806     }
   1807 
   1808     if (res == sizeof(buf)) {
   1809         return (void)fuse_reply_err(req, ENAMETOOLONG);
   1810     }
   1811 
   1812     buf[res] = '\0';
   1813 
   1814     fuse_reply_readlink(req, buf);
   1815 }
   1816 
/*
 * State of one open directory handle, created by lo_opendir() and registered
 * in the dirp map.
 */
struct lo_dirp {
    /*
     * Reference count: set to 1 by lo_opendir(), incremented by lo_dirp()
     * and decremented by lo_dirp_put(), which closes dp and frees the
     * structure when it drops to zero.
     */
    gint refcount;
    /* Directory stream obtained from fdopendir() */
    DIR *dp;
    /* Initialised to NULL by lo_opendir(); presumably the most recently
     * read entry -- confirm against the readdir code */
    struct dirent *entry;
    /* Initialised to 0 by lo_opendir(); presumably the current position in
     * the stream -- confirm against the readdir code */
    off_t offset;
};
   1823 
   1824 static void lo_dirp_put(struct lo_dirp **dp)
   1825 {
   1826     struct lo_dirp *d = *dp;
   1827 
   1828     if (!d) {
   1829         return;
   1830     }
   1831     *dp = NULL;
   1832 
   1833     if (g_atomic_int_dec_and_test(&d->refcount)) {
   1834         closedir(d->dp);
   1835         free(d);
   1836     }
   1837 }
   1838 
   1839 /* Call lo_dirp_put() on the return value when no longer needed */
   1840 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
   1841 {
   1842     struct lo_data *lo = lo_data(req);
   1843     struct lo_map_elem *elem;
   1844 
   1845     pthread_mutex_lock(&lo->mutex);
   1846     elem = lo_map_get(&lo->dirp_map, fi->fh);
   1847     if (elem) {
   1848         g_atomic_int_inc(&elem->dirp->refcount);
   1849     }
   1850     pthread_mutex_unlock(&lo->mutex);
   1851     if (!elem) {
   1852         return NULL;
   1853     }
   1854 
   1855     return elem->dirp;
   1856 }
   1857 
   1858 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
   1859                        struct fuse_file_info *fi)
   1860 {
   1861     int error = ENOMEM;
   1862     struct lo_data *lo = lo_data(req);
   1863     struct lo_dirp *d;
   1864     int fd;
   1865     ssize_t fh;
   1866 
   1867     d = calloc(1, sizeof(struct lo_dirp));
   1868     if (d == NULL) {
   1869         goto out_err;
   1870     }
   1871 
   1872     fd = openat(lo_fd(req, ino), ".", O_RDONLY);
   1873     if (fd == -1) {
   1874         goto out_errno;
   1875     }
   1876 
   1877     d->dp = fdopendir(fd);
   1878     if (d->dp == NULL) {
   1879         goto out_errno;
   1880     }
   1881 
   1882     d->offset = 0;
   1883     d->entry = NULL;
   1884 
   1885     g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
   1886     pthread_mutex_lock(&lo->mutex);
   1887     fh = lo_add_dirp_mapping(req, d);
   1888     pthread_mutex_unlock(&lo->mutex);
   1889     if (fh == -1) {
   1890         goto out_err;
   1891     }
   1892 
   1893     fi->fh = fh;
   1894     if (lo->cache == CACHE_ALWAYS) {
   1895         fi->cache_readdir = 1;
   1896     }
   1897     fuse_reply_open(req, fi);
   1898     return;
   1899 
   1900 out_errno:
   1901     error = errno;
   1902 out_err:
   1903     if (d) {
   1904         if (d->dp) {
   1905             closedir(d->dp);
   1906         } else if (fd != -1) {
   1907             close(fd);
   1908         }
   1909         free(d);
   1910     }
   1911     fuse_reply_err(req, error);
   1912 }
   1913 
/*
 * Common implementation of READDIR and READDIRPLUS.
 *
 * Fills a buffer of at most @size bytes with directory entries of the
 * open directory referenced by @fi, starting at @offset. When @plus is
 * non-zero each entry other than "." and ".." is also looked up with
 * lo_do_lookup() and emitted with fuse_add_direntry_plus().
 *
 * Replies with an error only if nothing has been stored in the buffer
 * yet; otherwise replies with the entries collected so far.
 */
static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
                          off_t offset, struct fuse_file_info *fi, int plus)
{
    struct lo_data *lo = lo_data(req);
    struct lo_dirp *d = NULL;
    struct lo_inode *dinode;
    g_autofree char *buf = NULL;
    char *p;
    size_t rem = size;
    /* Reported if either the inode or the directory handle is unknown */
    int err = EBADF;

    dinode = lo_inode(req, ino);
    if (!dinode) {
        goto error;
    }

    d = lo_dirp(req, fi);
    if (!d) {
        goto error;
    }

    err = ENOMEM;
    buf = g_try_malloc0(size);
    if (!buf) {
        goto error;
    }
    p = buf;

    /*
     * Reposition the stream only if the requested offset differs from
     * the recorded one; a pending entry is then no longer valid.
     */
    if (offset != d->offset) {
        seekdir(d->dp, offset);
        d->entry = NULL;
        d->offset = offset;
    }
    while (1) {
        size_t entsize;
        off_t nextoff;
        const char *name;

        /*
         * d->entry is still set if a previous pass stopped because the
         * entry did not fit; in that case it is reused instead of
         * reading a new one.
         */
        if (!d->entry) {
            /* Clear errno so end-of-stream and failure can be told apart */
            errno = 0;
            d->entry = readdir(d->dp);
            if (!d->entry) {
                if (errno) { /* Error */
                    err = errno;
                    goto error;
                } else { /* End of stream */
                    break;
                }
            }
        }
        nextoff = d->entry->d_off;
        name = d->entry->d_name;

        /* Non-zero only if lo_do_lookup() below succeeded for this entry */
        fuse_ino_t entry_ino = 0;
        /* d_type is shifted by 12 to form the st_mode value for the entry */
        struct fuse_entry_param e = (struct fuse_entry_param){
            .attr.st_ino = d->entry->d_ino,
            .attr.st_mode = d->entry->d_type << 12,
        };

        /* Hide root's parent directory */
        if (dinode == &lo->root && strcmp(name, "..") == 0) {
            e.attr.st_ino = lo->root.key.ino;
            e.attr.st_mode = DT_DIR << 12;
        }

        if (plus) {
            if (!is_dot_or_dotdot(name)) {
                err = lo_do_lookup(req, ino, name, &e, NULL);
                if (err) {
                    goto error;
                }
                entry_ino = e.ino;
            }

            entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
        } else {
            entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
        }
        /* A size larger than the remaining space: the entry is not emitted */
        if (entsize > rem) {
            /* Undo the lookup taken above for the entry that did not fit */
            if (entry_ino != 0) {
                lo_forget_one(req, entry_ino, 1);
            }
            break;
        }

        p += entsize;
        rem -= entsize;

        /* Entry consumed: advance the recorded stream position */
        d->entry = NULL;
        d->offset = nextoff;
    }

    err = 0;
error:
    lo_dirp_put(&d);
    lo_inode_put(lo, &dinode);

    /*
     * If there's an error, we can only signal it if we haven't stored
     * any entries yet - otherwise we'd end up with wrong lookup
     * counts for the entries that are already in the buffer. So we
     * return what we've collected until that point.
     */
    if (err && rem == size) {
        fuse_reply_err(req, err);
    } else {
        fuse_reply_buf(req, buf, size - rem);
    }
}
   2023 
   2024 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
   2025                        off_t offset, struct fuse_file_info *fi)
   2026 {
   2027     lo_do_readdir(req, ino, size, offset, fi, 0);
   2028 }
   2029 
   2030 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
   2031                            off_t offset, struct fuse_file_info *fi)
   2032 {
   2033     lo_do_readdir(req, ino, size, offset, fi, 1);
   2034 }
   2035 
   2036 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
   2037                           struct fuse_file_info *fi)
   2038 {
   2039     struct lo_data *lo = lo_data(req);
   2040     struct lo_map_elem *elem;
   2041     struct lo_dirp *d;
   2042 
   2043     (void)ino;
   2044 
   2045     pthread_mutex_lock(&lo->mutex);
   2046     elem = lo_map_get(&lo->dirp_map, fi->fh);
   2047     if (!elem) {
   2048         pthread_mutex_unlock(&lo->mutex);
   2049         fuse_reply_err(req, EBADF);
   2050         return;
   2051     }
   2052 
   2053     d = elem->dirp;
   2054     lo_map_remove(&lo->dirp_map, fi->fh);
   2055     pthread_mutex_unlock(&lo->mutex);
   2056 
   2057     lo_dirp_put(&d); /* paired with lo_opendir() */
   2058 
   2059     fuse_reply_err(req, 0);
   2060 }
   2061 
   2062 static void update_open_flags(int writeback, int allow_direct_io,
   2063                               struct fuse_file_info *fi)
   2064 {
   2065     /*
   2066      * With writeback cache, kernel may send read requests even
   2067      * when userspace opened write-only
   2068      */
   2069     if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
   2070         fi->flags &= ~O_ACCMODE;
   2071         fi->flags |= O_RDWR;
   2072     }
   2073 
   2074     /*
   2075      * With writeback cache, O_APPEND is handled by the kernel.
   2076      * This breaks atomicity (since the file may change in the
   2077      * underlying filesystem, so that the kernel's idea of the
   2078      * end of the file isn't accurate anymore). In this example,
   2079      * we just accept that. A more rigorous filesystem may want
   2080      * to return an error here
   2081      */
   2082     if (writeback && (fi->flags & O_APPEND)) {
   2083         fi->flags &= ~O_APPEND;
   2084     }
   2085 
   2086     /*
   2087      * O_DIRECT in guest should not necessarily mean bypassing page
   2088      * cache on host as well. Therefore, we discard it by default
   2089      * ('-o no_allow_direct_io'). If somebody needs that behavior,
   2090      * the '-o allow_direct_io' option should be set.
   2091      */
   2092     if (!allow_direct_io) {
   2093         fi->flags &= ~O_DIRECT;
   2094     }
   2095 }
   2096 
   2097 /*
   2098  * Open a regular file, set up an fd mapping, and fill out the struct
   2099  * fuse_file_info for it. If existing_fd is not negative, use that fd instead
   2100  * opening a new one. Takes ownership of existing_fd.
   2101  *
   2102  * Returns 0 on success or a positive errno.
   2103  */
   2104 static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
   2105                       int existing_fd, struct fuse_file_info *fi)
   2106 {
   2107     ssize_t fh;
   2108     int fd = existing_fd;
   2109     int err;
   2110     bool cap_fsetid_dropped = false;
   2111     bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
   2112 
   2113     update_open_flags(lo->writeback, lo->allow_direct_io, fi);
   2114 
   2115     if (fd < 0) {
   2116         if (kill_suidgid) {
   2117             err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
   2118             if (err) {
   2119                 return err;
   2120             }
   2121         }
   2122 
   2123         fd = lo_inode_open(lo, inode, fi->flags);
   2124 
   2125         if (cap_fsetid_dropped) {
   2126             if (gain_effective_cap("FSETID")) {
   2127                 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
   2128             }
   2129         }
   2130         if (fd < 0) {
   2131             return -fd;
   2132         }
   2133         if (fi->flags & (O_TRUNC)) {
   2134             int err = drop_security_capability(lo, fd);
   2135             if (err) {
   2136                 close(fd);
   2137                 return err;
   2138             }
   2139         }
   2140     }
   2141 
   2142     pthread_mutex_lock(&lo->mutex);
   2143     fh = lo_add_fd_mapping(lo, fd);
   2144     pthread_mutex_unlock(&lo->mutex);
   2145     if (fh == -1) {
   2146         close(fd);
   2147         return ENOMEM;
   2148     }
   2149 
   2150     fi->fh = fh;
   2151     if (lo->cache == CACHE_NONE) {
   2152         fi->direct_io = 1;
   2153     } else if (lo->cache == CACHE_ALWAYS) {
   2154         fi->keep_cache = 1;
   2155     }
   2156     return 0;
   2157 }
   2158 
   2159 static int do_create_nosecctx(fuse_req_t req, struct lo_inode *parent_inode,
   2160                                const char *name, mode_t mode,
   2161                                struct fuse_file_info *fi, int *open_fd,
   2162                               bool tmpfile)
   2163 {
   2164     int err, fd;
   2165     struct lo_cred old = {};
   2166     struct lo_data *lo = lo_data(req);
   2167     int flags;
   2168 
   2169     if (tmpfile) {
   2170         flags = fi->flags | O_TMPFILE;
   2171         /*
   2172          * Don't use O_EXCL as we want to link file later. Also reset O_CREAT
   2173          * otherwise openat() returns -EINVAL.
   2174          */
   2175         flags &= ~(O_CREAT | O_EXCL);
   2176 
   2177         /* O_TMPFILE needs either O_RDWR or O_WRONLY */
   2178         if ((flags & O_ACCMODE) == O_RDONLY) {
   2179             flags |= O_RDWR;
   2180         }
   2181     } else {
   2182         flags = fi->flags | O_CREAT | O_EXCL;
   2183     }
   2184 
   2185     err = lo_change_cred(req, &old, lo->change_umask);
   2186     if (err) {
   2187         return err;
   2188     }
   2189 
   2190     /* Try to create a new file but don't open existing files */
   2191     fd = openat(parent_inode->fd, name, flags, mode);
   2192     err = fd == -1 ? errno : 0;
   2193     lo_restore_cred(&old, lo->change_umask);
   2194     if (!err) {
   2195         *open_fd = fd;
   2196     }
   2197     return err;
   2198 }
   2199 
   2200 static int do_create_secctx_fscreate(fuse_req_t req,
   2201                                      struct lo_inode *parent_inode,
   2202                                      const char *name, mode_t mode,
   2203                                      struct fuse_file_info *fi, int *open_fd)
   2204 {
   2205     int err = 0, fd = -1, fscreate_fd = -1;
   2206     struct lo_data *lo = lo_data(req);
   2207 
   2208     err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
   2209                                  &fscreate_fd);
   2210     if (err) {
   2211         return err;
   2212     }
   2213 
   2214     err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false);
   2215 
   2216     close_reset_proc_fscreate(fscreate_fd);
   2217     if (!err) {
   2218         *open_fd = fd;
   2219     }
   2220     return err;
   2221 }
   2222 
   2223 static int do_create_secctx_tmpfile(fuse_req_t req,
   2224                                     struct lo_inode *parent_inode,
   2225                                     const char *name, mode_t mode,
   2226                                     struct fuse_file_info *fi,
   2227                                     const char *secctx_name, int *open_fd)
   2228 {
   2229     int err, fd = -1;
   2230     struct lo_data *lo = lo_data(req);
   2231     char procname[64];
   2232 
   2233     err = do_create_nosecctx(req, parent_inode, ".", mode, fi, &fd, true);
   2234     if (err) {
   2235         return err;
   2236     }
   2237 
   2238     err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0);
   2239     if (err) {
   2240         err = errno;
   2241         goto out;
   2242     }
   2243 
   2244     /* Security context set on file. Link it in place */
   2245     sprintf(procname, "%d", fd);
   2246     FCHDIR_NOFAIL(lo->proc_self_fd);
   2247     err = linkat(AT_FDCWD, procname, parent_inode->fd, name,
   2248                  AT_SYMLINK_FOLLOW);
   2249     err = err == -1 ? errno : 0;
   2250     FCHDIR_NOFAIL(lo->root.fd);
   2251 
   2252 out:
   2253     if (!err) {
   2254         *open_fd = fd;
   2255     } else if (fd != -1) {
   2256         close(fd);
   2257     }
   2258     return err;
   2259 }
   2260 
   2261 static int do_create_secctx_noatomic(fuse_req_t req,
   2262                                      struct lo_inode *parent_inode,
   2263                                      const char *name, mode_t mode,
   2264                                      struct fuse_file_info *fi,
   2265                                      const char *secctx_name, int *open_fd)
   2266 {
   2267     int err = 0, fd = -1;
   2268 
   2269     err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false);
   2270     if (err) {
   2271         goto out;
   2272     }
   2273 
   2274     /* Set security context. This is not atomic w.r.t file creation */
   2275     err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0);
   2276     err = err == -1 ? errno : 0;
   2277 out:
   2278     if (!err) {
   2279         *open_fd = fd;
   2280     } else {
   2281         if (fd != -1) {
   2282             close(fd);
   2283             unlinkat(parent_inode->fd, name, 0);
   2284         }
   2285     }
   2286     return err;
   2287 }
   2288 
   2289 static int do_lo_create(fuse_req_t req, struct lo_inode *parent_inode,
   2290                         const char *name, mode_t mode,
   2291                         struct fuse_file_info *fi, int *open_fd)
   2292 {
   2293     struct lo_data *lo = lo_data(req);
   2294     char *mapped_name = NULL;
   2295     int err;
   2296     const char *ctxname = req->secctx.name;
   2297     bool secctx_enabled = req->secctx.ctxlen;
   2298 
   2299     if (secctx_enabled && lo->xattrmap) {
   2300         err = xattr_map_client(lo, req->secctx.name, &mapped_name);
   2301         if (err < 0) {
   2302             return -err;
   2303         }
   2304 
   2305         ctxname = mapped_name;
   2306     }
   2307 
   2308     if (secctx_enabled) {
   2309         /*
   2310          * If security.selinux has not been remapped and selinux is enabled,
   2311          * use fscreate to set context before file creation. If not, use
   2312          * tmpfile method for regular files. Otherwise fallback to
   2313          * non-atomic method of file creation and xattr setting.
   2314          */
   2315         if (!mapped_name && lo->use_fscreate) {
   2316             err = do_create_secctx_fscreate(req, parent_inode, name, mode, fi,
   2317                                             open_fd);
   2318             goto out;
   2319         } else if (S_ISREG(mode)) {
   2320             err = do_create_secctx_tmpfile(req, parent_inode, name, mode, fi,
   2321                                            ctxname, open_fd);
   2322             /*
   2323              * If filesystem does not support O_TMPFILE, fallback to non-atomic
   2324              * method.
   2325              */
   2326             if (!err || err != EOPNOTSUPP) {
   2327                 goto out;
   2328             }
   2329         }
   2330 
   2331         err = do_create_secctx_noatomic(req, parent_inode, name, mode, fi,
   2332                                         ctxname, open_fd);
   2333     } else {
   2334         err = do_create_nosecctx(req, parent_inode, name, mode, fi, open_fd,
   2335                                  false);
   2336     }
   2337 
   2338 out:
   2339     g_free(mapped_name);
   2340     return err;
   2341 }
   2342 
   2343 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
   2344                       mode_t mode, struct fuse_file_info *fi)
   2345 {
   2346     int fd = -1;
   2347     struct lo_data *lo = lo_data(req);
   2348     struct lo_inode *parent_inode;
   2349     struct lo_inode *inode = NULL;
   2350     struct fuse_entry_param e;
   2351     int err;
   2352 
   2353     fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
   2354              " kill_priv=%d\n", parent, name, fi->kill_priv);
   2355 
   2356     if (!is_safe_path_component(name)) {
   2357         fuse_reply_err(req, EINVAL);
   2358         return;
   2359     }
   2360 
   2361     parent_inode = lo_inode(req, parent);
   2362     if (!parent_inode) {
   2363         fuse_reply_err(req, EBADF);
   2364         return;
   2365     }
   2366 
   2367     update_open_flags(lo->writeback, lo->allow_direct_io, fi);
   2368 
   2369     err = do_lo_create(req, parent_inode, name, mode, fi, &fd);
   2370 
   2371     /* Ignore the error if file exists and O_EXCL was not given */
   2372     if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
   2373         goto out;
   2374     }
   2375 
   2376     err = lo_do_lookup(req, parent, name, &e, &inode);
   2377     if (err) {
   2378         goto out;
   2379     }
   2380 
   2381     err = lo_do_open(lo, inode, fd, fi);
   2382     fd = -1; /* lo_do_open() takes ownership of fd */
   2383     if (err) {
   2384         /* Undo lo_do_lookup() nlookup ref */
   2385         unref_inode_lolocked(lo, inode, 1);
   2386     }
   2387 
   2388 out:
   2389     lo_inode_put(lo, &inode);
   2390     lo_inode_put(lo, &parent_inode);
   2391 
   2392     if (err) {
   2393         if (fd >= 0) {
   2394             close(fd);
   2395         }
   2396 
   2397         fuse_reply_err(req, err);
   2398     } else {
   2399         fuse_reply_create(req, &e, fi);
   2400     }
   2401 }
   2402 
   2403 /* Should be called with inode->plock_mutex held */
   2404 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
   2405                                                       struct lo_inode *inode,
   2406                                                       uint64_t lock_owner,
   2407                                                       pid_t pid, int *err)
   2408 {
   2409     struct lo_inode_plock *plock;
   2410     int fd;
   2411 
   2412     plock =
   2413         g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
   2414 
   2415     if (plock) {
   2416         return plock;
   2417     }
   2418 
   2419     plock = malloc(sizeof(struct lo_inode_plock));
   2420     if (!plock) {
   2421         *err = ENOMEM;
   2422         return NULL;
   2423     }
   2424 
   2425     /* Open another instance of file which can be used for ofd locks. */
   2426     /* TODO: What if file is not writable? */
   2427     fd = lo_inode_open(lo, inode, O_RDWR);
   2428     if (fd < 0) {
   2429         *err = -fd;
   2430         free(plock);
   2431         return NULL;
   2432     }
   2433 
   2434     plock->lock_owner = lock_owner;
   2435     plock->fd = fd;
   2436     g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
   2437                         plock);
   2438     return plock;
   2439 }
   2440 
   2441 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2442                      struct flock *lock)
   2443 {
   2444     struct lo_data *lo = lo_data(req);
   2445     struct lo_inode *inode;
   2446     struct lo_inode_plock *plock;
   2447     int ret, saverr = 0;
   2448 
   2449     fuse_log(FUSE_LOG_DEBUG,
   2450              "lo_getlk(ino=%" PRIu64 ", flags=%d)"
   2451              " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
   2452              " l_len=0x%" PRIx64 "\n",
   2453              ino, fi->flags, fi->lock_owner, lock->l_type,
   2454              (uint64_t)lock->l_start, (uint64_t)lock->l_len);
   2455 
   2456     if (!lo->posix_lock) {
   2457         fuse_reply_err(req, ENOSYS);
   2458         return;
   2459     }
   2460 
   2461     inode = lo_inode(req, ino);
   2462     if (!inode) {
   2463         fuse_reply_err(req, EBADF);
   2464         return;
   2465     }
   2466 
   2467     pthread_mutex_lock(&inode->plock_mutex);
   2468     plock =
   2469         lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
   2470     if (!plock) {
   2471         saverr = ret;
   2472         goto out;
   2473     }
   2474 
   2475     ret = fcntl(plock->fd, F_OFD_GETLK, lock);
   2476     if (ret == -1) {
   2477         saverr = errno;
   2478     }
   2479 
   2480 out:
   2481     pthread_mutex_unlock(&inode->plock_mutex);
   2482     lo_inode_put(lo, &inode);
   2483 
   2484     if (saverr) {
   2485         fuse_reply_err(req, saverr);
   2486     } else {
   2487         fuse_reply_lock(req, lock);
   2488     }
   2489 }
   2490 
   2491 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2492                      struct flock *lock, int sleep)
   2493 {
   2494     struct lo_data *lo = lo_data(req);
   2495     struct lo_inode *inode;
   2496     struct lo_inode_plock *plock;
   2497     int ret, saverr = 0;
   2498 
   2499     fuse_log(FUSE_LOG_DEBUG,
   2500              "lo_setlk(ino=%" PRIu64 ", flags=%d)"
   2501              " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
   2502              " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
   2503              ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
   2504              lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len);
   2505 
   2506     if (!lo->posix_lock) {
   2507         fuse_reply_err(req, ENOSYS);
   2508         return;
   2509     }
   2510 
   2511     if (sleep) {
   2512         fuse_reply_err(req, EOPNOTSUPP);
   2513         return;
   2514     }
   2515 
   2516     inode = lo_inode(req, ino);
   2517     if (!inode) {
   2518         fuse_reply_err(req, EBADF);
   2519         return;
   2520     }
   2521 
   2522     pthread_mutex_lock(&inode->plock_mutex);
   2523     plock =
   2524         lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
   2525 
   2526     if (!plock) {
   2527         saverr = ret;
   2528         goto out;
   2529     }
   2530 
   2531     /* TODO: Is it alright to modify flock? */
   2532     lock->l_pid = 0;
   2533     ret = fcntl(plock->fd, F_OFD_SETLK, lock);
   2534     if (ret == -1) {
   2535         saverr = errno;
   2536     }
   2537 
   2538 out:
   2539     pthread_mutex_unlock(&inode->plock_mutex);
   2540     lo_inode_put(lo, &inode);
   2541 
   2542     fuse_reply_err(req, saverr);
   2543 }
   2544 
   2545 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
   2546                         struct fuse_file_info *fi)
   2547 {
   2548     int res;
   2549     struct lo_dirp *d;
   2550     int fd;
   2551 
   2552     (void)ino;
   2553 
   2554     d = lo_dirp(req, fi);
   2555     if (!d) {
   2556         fuse_reply_err(req, EBADF);
   2557         return;
   2558     }
   2559 
   2560     fd = dirfd(d->dp);
   2561     if (datasync) {
   2562         res = fdatasync(fd);
   2563     } else {
   2564         res = fsync(fd);
   2565     }
   2566 
   2567     lo_dirp_put(&d);
   2568 
   2569     fuse_reply_err(req, res == -1 ? errno : 0);
   2570 }
   2571 
   2572 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
   2573 {
   2574     struct lo_data *lo = lo_data(req);
   2575     struct lo_inode *inode = lo_inode(req, ino);
   2576     int err;
   2577 
   2578     fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
   2579              "\n", ino, fi->flags, fi->kill_priv);
   2580 
   2581     if (!inode) {
   2582         fuse_reply_err(req, EBADF);
   2583         return;
   2584     }
   2585 
   2586     err = lo_do_open(lo, inode, -1, fi);
   2587     lo_inode_put(lo, &inode);
   2588     if (err) {
   2589         fuse_reply_err(req, err);
   2590     } else {
   2591         fuse_reply_open(req, fi);
   2592     }
   2593 }
   2594 
   2595 static void lo_release(fuse_req_t req, fuse_ino_t ino,
   2596                        struct fuse_file_info *fi)
   2597 {
   2598     struct lo_data *lo = lo_data(req);
   2599     struct lo_map_elem *elem;
   2600     int fd = -1;
   2601 
   2602     (void)ino;
   2603 
   2604     pthread_mutex_lock(&lo->mutex);
   2605     elem = lo_map_get(&lo->fd_map, fi->fh);
   2606     if (elem) {
   2607         fd = elem->fd;
   2608         elem = NULL;
   2609         lo_map_remove(&lo->fd_map, fi->fh);
   2610     }
   2611     pthread_mutex_unlock(&lo->mutex);
   2612 
   2613     close(fd);
   2614     fuse_reply_err(req, 0);
   2615 }
   2616 
   2617 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
   2618 {
   2619     int res;
   2620     (void)ino;
   2621     struct lo_inode *inode;
   2622     struct lo_data *lo = lo_data(req);
   2623 
   2624     inode = lo_inode(req, ino);
   2625     if (!inode) {
   2626         fuse_reply_err(req, EBADF);
   2627         return;
   2628     }
   2629 
   2630     if (!S_ISREG(inode->filetype)) {
   2631         lo_inode_put(lo, &inode);
   2632         fuse_reply_err(req, EBADF);
   2633         return;
   2634     }
   2635 
   2636     /* An fd is going away. Cleanup associated posix locks */
   2637     if (lo->posix_lock) {
   2638         pthread_mutex_lock(&inode->plock_mutex);
   2639         g_hash_table_remove(inode->posix_locks,
   2640             GUINT_TO_POINTER(fi->lock_owner));
   2641         pthread_mutex_unlock(&inode->plock_mutex);
   2642     }
   2643     res = close(dup(lo_fi_fd(req, fi)));
   2644     lo_inode_put(lo, &inode);
   2645     fuse_reply_err(req, res == -1 ? errno : 0);
   2646 }
   2647 
   2648 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
   2649                      struct fuse_file_info *fi)
   2650 {
   2651     struct lo_inode *inode = lo_inode(req, ino);
   2652     struct lo_data *lo = lo_data(req);
   2653     int res;
   2654     int fd;
   2655 
   2656     fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
   2657              (void *)fi);
   2658 
   2659     if (!inode) {
   2660         fuse_reply_err(req, EBADF);
   2661         return;
   2662     }
   2663 
   2664     if (!fi) {
   2665         fd = lo_inode_open(lo, inode, O_RDWR);
   2666         if (fd < 0) {
   2667             res = -fd;
   2668             goto out;
   2669         }
   2670     } else {
   2671         fd = lo_fi_fd(req, fi);
   2672     }
   2673 
   2674     if (datasync) {
   2675         res = fdatasync(fd) == -1 ? errno : 0;
   2676     } else {
   2677         res = fsync(fd) == -1 ? errno : 0;
   2678     }
   2679     if (!fi) {
   2680         close(fd);
   2681     }
   2682 out:
   2683     lo_inode_put(lo, &inode);
   2684     fuse_reply_err(req, res);
   2685 }
   2686 
   2687 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
   2688                     struct fuse_file_info *fi)
   2689 {
   2690     struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
   2691 
   2692     fuse_log(FUSE_LOG_DEBUG,
   2693              "lo_read(ino=%" PRIu64 ", size=%zd, "
   2694              "off=%lu)\n",
   2695              ino, size, (unsigned long)offset);
   2696 
   2697     buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
   2698     buf.buf[0].fd = lo_fi_fd(req, fi);
   2699     buf.buf[0].pos = offset;
   2700 
   2701     fuse_reply_data(req, &buf);
   2702 }
   2703 
   2704 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
   2705                          struct fuse_bufvec *in_buf, off_t off,
   2706                          struct fuse_file_info *fi)
   2707 {
   2708     (void)ino;
   2709     ssize_t res;
   2710     struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
   2711     bool cap_fsetid_dropped = false;
   2712 
   2713     out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
   2714     out_buf.buf[0].fd = lo_fi_fd(req, fi);
   2715     out_buf.buf[0].pos = off;
   2716 
   2717     fuse_log(FUSE_LOG_DEBUG,
   2718              "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
   2719              ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
   2720 
   2721     res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
   2722     if (res) {
   2723         fuse_reply_err(req, res);
   2724         return;
   2725     }
   2726 
   2727     /*
   2728      * If kill_priv is set, drop CAP_FSETID which should lead to kernel
   2729      * clearing setuid/setgid on file. Note, for WRITE, we need to do
   2730      * this even if killpriv_v2 is not enabled. fuse direct write path
   2731      * relies on this.
   2732      */
   2733     if (fi->kill_priv) {
   2734         res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
   2735         if (res != 0) {
   2736             fuse_reply_err(req, res);
   2737             return;
   2738         }
   2739     }
   2740 
   2741     res = fuse_buf_copy(&out_buf, in_buf);
   2742     if (res < 0) {
   2743         fuse_reply_err(req, -res);
   2744     } else {
   2745         fuse_reply_write(req, (size_t)res);
   2746     }
   2747 
   2748     if (cap_fsetid_dropped) {
   2749         res = gain_effective_cap("FSETID");
   2750         if (res) {
   2751             fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
   2752         }
   2753     }
   2754 }
   2755 
   2756 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
   2757 {
   2758     int res;
   2759     struct statvfs stbuf;
   2760 
   2761     res = fstatvfs(lo_fd(req, ino), &stbuf);
   2762     if (res == -1) {
   2763         fuse_reply_err(req, errno);
   2764     } else {
   2765         fuse_reply_statfs(req, &stbuf);
   2766     }
   2767 }
   2768 
   2769 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
   2770                          off_t length, struct fuse_file_info *fi)
   2771 {
   2772     int err = EOPNOTSUPP;
   2773     (void)ino;
   2774 
   2775 #ifdef CONFIG_FALLOCATE
   2776     err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
   2777     if (err < 0) {
   2778         err = errno;
   2779     }
   2780 
   2781 #elif defined(CONFIG_POSIX_FALLOCATE)
   2782     if (mode) {
   2783         fuse_reply_err(req, EOPNOTSUPP);
   2784         return;
   2785     }
   2786 
   2787     err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
   2788 #endif
   2789 
   2790     fuse_reply_err(req, err);
   2791 }
   2792 
   2793 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2794                      int op)
   2795 {
   2796     int res;
   2797     (void)ino;
   2798 
   2799     if (!(op & LOCK_NB)) {
   2800         /*
   2801          * Blocking flock can deadlock as there is only one thread
   2802          * serving the queue.
   2803          */
   2804         fuse_reply_err(req, EOPNOTSUPP);
   2805         return;
   2806     }
   2807 
   2808     res = flock(lo_fi_fd(req, fi), op);
   2809 
   2810     fuse_reply_err(req, res == -1 ? errno : 0);
   2811 }
   2812 
/*
 * Flag bits stored in XattrMapEntry.flags.
 * The low bits select the rule type (what happens on a match); bits 16
 * and 17 select the scope (which operations the rule is applied to).
 */

/* types */
/*
 * Stop at this rule; the attribute name is passed through unmodified.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)
/*
 * For attributes that start with 'key', prepend 'prepend'.
 * 'key' may be empty to prepend for all attributes.
 * 'key' is defined from the set/remove point of view;
 * the mapping is automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)
/*
 * The attribute is unsupported;
 * ENOTSUP on write, hidden on read.
 */
#define XATTR_MAP_FLAG_UNSUPPORTED     (1 <<  3)

/* scopes */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
   2844 
   2845 static void add_xattrmap_entry(struct lo_data *lo,
   2846                                const XattrMapEntry *new_entry)
   2847 {
   2848     XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
   2849                                      lo->xattr_map_nentries + 1,
   2850                                      sizeof(XattrMapEntry));
   2851     res[lo->xattr_map_nentries++] = *new_entry;
   2852 
   2853     lo->xattr_map_list = res;
   2854 }
   2855 
   2856 static void free_xattrmap(struct lo_data *lo)
   2857 {
   2858     XattrMapEntry *map = lo->xattr_map_list;
   2859     size_t i;
   2860 
   2861     if (!map) {
   2862         return;
   2863     }
   2864 
   2865     for (i = 0; i < lo->xattr_map_nentries; i++) {
   2866         g_free(map[i].key);
   2867         g_free(map[i].prepend);
   2868     };
   2869 
   2870     g_free(map);
   2871     lo->xattr_map_list = NULL;
   2872     lo->xattr_map_nentries = -1;
   2873 }
   2874 
   2875 /*
   2876  * Handle the 'map' type, which is sugar for a set of commands
   2877  * for the common case of prefixing a subset or everything,
   2878  * and allowing anything not prefixed through.
   2879  * It must be the last entry in the stream, although there
   2880  * can be other entries before it.
   2881  * The form is:
   2882  *    :map:key:prefix:
   2883  *
   2884  * key maybe empty in which case all entries are prefixed.
   2885  */
   2886 static void parse_xattrmap_map(struct lo_data *lo,
   2887                                const char *rule, char sep)
   2888 {
   2889     const char *tmp;
   2890     char *key;
   2891     char *prefix;
   2892     XattrMapEntry tmp_entry;
   2893 
   2894     if (*rule != sep) {
   2895         fuse_log(FUSE_LOG_ERR,
   2896                  "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
   2897                  __func__, sep, *rule);
   2898         exit(1);
   2899     }
   2900 
   2901     rule++;
   2902 
   2903     /* At start of 'key' field */
   2904     tmp = strchr(rule, sep);
   2905     if (!tmp) {
   2906         fuse_log(FUSE_LOG_ERR,
   2907                  "%s: Missing '%c' at end of key field in map rule\n",
   2908                  __func__, sep);
   2909         exit(1);
   2910     }
   2911 
   2912     key = g_strndup(rule, tmp - rule);
   2913     rule = tmp + 1;
   2914 
   2915     /* At start of prefix field */
   2916     tmp = strchr(rule, sep);
   2917     if (!tmp) {
   2918         fuse_log(FUSE_LOG_ERR,
   2919                  "%s: Missing '%c' at end of prefix field in map rule\n",
   2920                  __func__, sep);
   2921         exit(1);
   2922     }
   2923 
   2924     prefix = g_strndup(rule, tmp - rule);
   2925     rule = tmp + 1;
   2926 
   2927     /*
   2928      * This should be the end of the string, we don't allow
   2929      * any more commands after 'map'.
   2930      */
   2931     if (*rule) {
   2932         fuse_log(FUSE_LOG_ERR,
   2933                  "%s: Expecting end of command after map, found '%c'\n",
   2934                  __func__, *rule);
   2935         exit(1);
   2936     }
   2937 
   2938     /* 1st: Prefix matches/everything */
   2939     tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
   2940     tmp_entry.key = g_strdup(key);
   2941     tmp_entry.prepend = g_strdup(prefix);
   2942     add_xattrmap_entry(lo, &tmp_entry);
   2943 
   2944     if (!*key) {
   2945         /* Prefix all case */
   2946 
   2947         /* 2nd: Hide any non-prefixed entries on the host */
   2948         tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
   2949         tmp_entry.key = g_strdup("");
   2950         tmp_entry.prepend = g_strdup("");
   2951         add_xattrmap_entry(lo, &tmp_entry);
   2952     } else {
   2953         /* Prefix matching case */
   2954 
   2955         /* 2nd: Hide non-prefixed but matching entries on the host */
   2956         tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
   2957         tmp_entry.key = g_strdup(""); /* Not used */
   2958         tmp_entry.prepend = g_strdup(key);
   2959         add_xattrmap_entry(lo, &tmp_entry);
   2960 
   2961         /* 3rd: Stop the client accessing prefixed attributes directly */
   2962         tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
   2963         tmp_entry.key = g_strdup(prefix);
   2964         tmp_entry.prepend = g_strdup(""); /* Not used */
   2965         add_xattrmap_entry(lo, &tmp_entry);
   2966 
   2967         /* 4th: Everything else is OK */
   2968         tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
   2969         tmp_entry.key = g_strdup("");
   2970         tmp_entry.prepend = g_strdup("");
   2971         add_xattrmap_entry(lo, &tmp_entry);
   2972     }
   2973 
   2974     g_free(key);
   2975     g_free(prefix);
   2976 }
   2977 
   2978 static void parse_xattrmap(struct lo_data *lo)
   2979 {
   2980     const char *map = lo->xattrmap;
   2981     const char *tmp;
   2982     int ret;
   2983 
   2984     lo->xattr_map_nentries = 0;
   2985     while (*map) {
   2986         XattrMapEntry tmp_entry;
   2987         char sep;
   2988 
   2989         if (isspace(*map)) {
   2990             map++;
   2991             continue;
   2992         }
   2993         /* The separator is the first non-space of the rule */
   2994         sep = *map++;
   2995         if (!sep) {
   2996             break;
   2997         }
   2998 
   2999         tmp_entry.flags = 0;
   3000         /* Start of 'type' */
   3001         if (strstart(map, "prefix", &map)) {
   3002             tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
   3003         } else if (strstart(map, "ok", &map)) {
   3004             tmp_entry.flags |= XATTR_MAP_FLAG_OK;
   3005         } else if (strstart(map, "bad", &map)) {
   3006             tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
   3007         } else if (strstart(map, "unsupported", &map)) {
   3008             tmp_entry.flags |= XATTR_MAP_FLAG_UNSUPPORTED;
   3009         } else if (strstart(map, "map", &map)) {
   3010             /*
   3011              * map is sugar that adds a number of rules, and must be
   3012              * the last entry.
   3013              */
   3014             parse_xattrmap_map(lo, map, sep);
   3015             break;
   3016         } else {
   3017             fuse_log(FUSE_LOG_ERR,
   3018                      "%s: Unexpected type;"
   3019                      "Expecting 'prefix', 'ok', 'bad', 'unsupported' or 'map'"
   3020                      " in rule %zu\n", __func__, lo->xattr_map_nentries);
   3021             exit(1);
   3022         }
   3023 
   3024         if (*map++ != sep) {
   3025             fuse_log(FUSE_LOG_ERR,
   3026                      "%s: Missing '%c' at end of type field of rule %zu\n",
   3027                      __func__, sep, lo->xattr_map_nentries);
   3028             exit(1);
   3029         }
   3030 
   3031         /* Start of 'scope' */
   3032         if (strstart(map, "client", &map)) {
   3033             tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
   3034         } else if (strstart(map, "server", &map)) {
   3035             tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
   3036         } else if (strstart(map, "all", &map)) {
   3037             tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
   3038         } else {
   3039             fuse_log(FUSE_LOG_ERR,
   3040                      "%s: Unexpected scope;"
   3041                      " Expecting 'client', 'server', or 'all', in rule %zu\n",
   3042                      __func__, lo->xattr_map_nentries);
   3043             exit(1);
   3044         }
   3045 
   3046         if (*map++ != sep) {
   3047             fuse_log(FUSE_LOG_ERR,
   3048                      "%s: Expecting '%c' found '%c'"
   3049                      " after scope in rule %zu\n",
   3050                      __func__, sep, *map, lo->xattr_map_nentries);
   3051             exit(1);
   3052         }
   3053 
   3054         /* At start of 'key' field */
   3055         tmp = strchr(map, sep);
   3056         if (!tmp) {
   3057             fuse_log(FUSE_LOG_ERR,
   3058                      "%s: Missing '%c' at end of key field of rule %zu",
   3059                      __func__, sep, lo->xattr_map_nentries);
   3060             exit(1);
   3061         }
   3062         tmp_entry.key = g_strndup(map, tmp - map);
   3063         map = tmp + 1;
   3064 
   3065         /* At start of 'prepend' field */
   3066         tmp = strchr(map, sep);
   3067         if (!tmp) {
   3068             fuse_log(FUSE_LOG_ERR,
   3069                      "%s: Missing '%c' at end of prepend field of rule %zu",
   3070                      __func__, sep, lo->xattr_map_nentries);
   3071             exit(1);
   3072         }
   3073         tmp_entry.prepend = g_strndup(map, tmp - map);
   3074         map = tmp + 1;
   3075 
   3076         add_xattrmap_entry(lo, &tmp_entry);
   3077         /* End of rule - go around again for another rule */
   3078     }
   3079 
   3080     if (!lo->xattr_map_nentries) {
   3081         fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
   3082         exit(1);
   3083     }
   3084 
   3085     ret = xattr_map_client(lo, "security.capability",
   3086                            &lo->xattr_security_capability);
   3087     if (ret) {
   3088         fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
   3089                 strerror(ret));
   3090         exit(1);
   3091     }
   3092     if (!lo->xattr_security_capability ||
   3093         !strcmp(lo->xattr_security_capability, "security.capability")) {
   3094         /* 1-1 mapping, don't need to do anything */
   3095         free(lo->xattr_security_capability);
   3096         lo->xattr_security_capability = NULL;
   3097     }
   3098 }
   3099 
   3100 /*
   3101  * For use with getxattr/setxattr/removexattr, where the client
   3102  * gives us a name and we may need to choose a different one.
   3103  * Allocates a buffer for the result placing it in *out_name.
   3104  *   If there's no change then *out_name is not set.
   3105  * Returns 0 on success
   3106  * Can return -EPERM to indicate we block a given attribute
   3107  *   (in which case out_name is not allocated)
   3108  * Can return -ENOMEM to indicate out_name couldn't be allocated.
   3109  */
   3110 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
   3111                             char **out_name)
   3112 {
   3113     size_t i;
   3114     for (i = 0; i < lo->xattr_map_nentries; i++) {
   3115         const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
   3116 
   3117         if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
   3118             (strstart(client_name, cur_entry->key, NULL))) {
   3119             if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
   3120                 return -EPERM;
   3121             }
   3122             if (cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
   3123                 return -ENOTSUP;
   3124             }
   3125             if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
   3126                 /* Unmodified name */
   3127                 return 0;
   3128             }
   3129             if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
   3130                 *out_name = g_try_malloc(strlen(client_name) +
   3131                                          strlen(cur_entry->prepend) + 1);
   3132                 if (!*out_name) {
   3133                     return -ENOMEM;
   3134                 }
   3135                 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
   3136                 return 0;
   3137             }
   3138         }
   3139     }
   3140 
   3141     return -EPERM;
   3142 }
   3143 
   3144 /*
   3145  * For use with listxattr where the server fs gives us a name and we may need
   3146  * to sanitize this for the client.
   3147  * Returns a pointer to the result in *out_name
   3148  *   This is always the original string or the current string with some prefix
   3149  *   removed; no reallocation is done.
   3150  * Returns 0 on success
   3151  * Can return -ENODATA to indicate the name should be dropped from the list.
   3152  */
   3153 static int xattr_map_server(const struct lo_data *lo, const char *server_name,
   3154                             const char **out_name)
   3155 {
   3156     size_t i;
   3157     const char *end;
   3158 
   3159     for (i = 0; i < lo->xattr_map_nentries; i++) {
   3160         const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
   3161 
   3162         if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
   3163             (strstart(server_name, cur_entry->prepend, &end))) {
   3164             if (cur_entry->flags & XATTR_MAP_FLAG_BAD ||
   3165                 cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
   3166                 return -ENODATA;
   3167             }
   3168             if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
   3169                 *out_name = server_name;
   3170                 return 0;
   3171             }
   3172             if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
   3173                 /* Remove prefix */
   3174                 *out_name = end;
   3175                 return 0;
   3176             }
   3177         }
   3178     }
   3179 
   3180     return -ENODATA;
   3181 }
   3182 
   3183 static bool block_xattr(struct lo_data *lo, const char *name)
   3184 {
   3185     /*
   3186      * If user explicitly enabled posix_acl or did not provide any option,
   3187      * do not block acl. Otherwise block system.posix_acl_access and
   3188      * system.posix_acl_default xattrs.
   3189      */
   3190     if (lo->user_posix_acl) {
   3191         return false;
   3192     }
   3193     if (!strcmp(name, "system.posix_acl_access") ||
   3194         !strcmp(name, "system.posix_acl_default"))
   3195             return true;
   3196 
   3197     return false;
   3198 }
   3199 
   3200 /*
   3201  * Returns number of bytes in xattr_list after filtering on success. This
   3202  * could be zero as well if nothing is left after filtering.
   3203  *
   3204  * Returns negative error code on failure.
   3205  * xattr_list is modified in place.
   3206  */
   3207 static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
   3208                                  unsigned in_size)
   3209 {
   3210     size_t out_index, in_index;
   3211 
   3212     /*
   3213      * As of now we only filter out acl xattrs. If acls are enabled or
   3214      * they have not been explicitly disabled, there is nothing to
   3215      * filter.
   3216      */
   3217     if (lo->user_posix_acl) {
   3218         return in_size;
   3219     }
   3220 
   3221     out_index = 0;
   3222     in_index = 0;
   3223     while (in_index < in_size) {
   3224         char *in_ptr = xattr_list + in_index;
   3225 
   3226         /* Length of current attribute name */
   3227         size_t in_len = strlen(xattr_list + in_index) + 1;
   3228 
   3229         if (!block_xattr(lo, in_ptr)) {
   3230             if (in_index != out_index) {
   3231                 memmove(xattr_list + out_index, xattr_list + in_index, in_len);
   3232             }
   3233             out_index += in_len;
   3234         }
   3235         in_index += in_len;
   3236      }
   3237     return out_index;
   3238 }
   3239 
   3240 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
   3241                         size_t size)
   3242 {
   3243     struct lo_data *lo = lo_data(req);
   3244     g_autofree char *value = NULL;
   3245     char procname[64];
   3246     const char *name;
   3247     char *mapped_name;
   3248     struct lo_inode *inode;
   3249     ssize_t ret;
   3250     int saverr;
   3251     int fd = -1;
   3252 
   3253     if (block_xattr(lo, in_name)) {
   3254         fuse_reply_err(req, EOPNOTSUPP);
   3255         return;
   3256     }
   3257 
   3258     mapped_name = NULL;
   3259     name = in_name;
   3260     if (lo->xattrmap) {
   3261         ret = xattr_map_client(lo, in_name, &mapped_name);
   3262         if (ret < 0) {
   3263             if (ret == -EPERM) {
   3264                 ret = -ENODATA;
   3265             }
   3266             fuse_reply_err(req, -ret);
   3267             return;
   3268         }
   3269         if (mapped_name) {
   3270             name = mapped_name;
   3271         }
   3272     }
   3273 
   3274     inode = lo_inode(req, ino);
   3275     if (!inode) {
   3276         fuse_reply_err(req, EBADF);
   3277         g_free(mapped_name);
   3278         return;
   3279     }
   3280 
   3281     saverr = ENOSYS;
   3282     if (!lo_data(req)->xattr) {
   3283         goto out;
   3284     }
   3285 
   3286     fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
   3287              ino, name, size);
   3288 
   3289     if (size) {
   3290         value = g_try_malloc(size);
   3291         if (!value) {
   3292             goto out_err;
   3293         }
   3294     }
   3295 
   3296     sprintf(procname, "%i", inode->fd);
   3297     /*
   3298      * It is not safe to open() non-regular/non-dir files in file server
   3299      * unless O_PATH is used, so use that method for regular files/dir
   3300      * only (as it seems giving less performance overhead).
   3301      * Otherwise, call fchdir() to avoid open().
   3302      */
   3303     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3304         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3305         if (fd < 0) {
   3306             goto out_err;
   3307         }
   3308         ret = fgetxattr(fd, name, value, size);
   3309         saverr = ret == -1 ? errno : 0;
   3310     } else {
   3311         /* fchdir should not fail here */
   3312         FCHDIR_NOFAIL(lo->proc_self_fd);
   3313         ret = getxattr(procname, name, value, size);
   3314         saverr = ret == -1 ? errno : 0;
   3315         FCHDIR_NOFAIL(lo->root.fd);
   3316     }
   3317 
   3318     if (ret == -1) {
   3319         goto out;
   3320     }
   3321     if (size) {
   3322         saverr = 0;
   3323         if (ret == 0) {
   3324             goto out;
   3325         }
   3326         fuse_reply_buf(req, value, ret);
   3327     } else {
   3328         fuse_reply_xattr(req, ret);
   3329     }
   3330 out_free:
   3331     if (fd >= 0) {
   3332         close(fd);
   3333     }
   3334 
   3335     lo_inode_put(lo, &inode);
   3336     return;
   3337 
   3338 out_err:
   3339     saverr = errno;
   3340 out:
   3341     fuse_reply_err(req, saverr);
   3342     g_free(mapped_name);
   3343     goto out_free;
   3344 }
   3345 
   3346 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
   3347 {
   3348     struct lo_data *lo = lo_data(req);
   3349     g_autofree char *value = NULL;
   3350     char procname[64];
   3351     struct lo_inode *inode;
   3352     ssize_t ret;
   3353     int saverr;
   3354     int fd = -1;
   3355 
   3356     inode = lo_inode(req, ino);
   3357     if (!inode) {
   3358         fuse_reply_err(req, EBADF);
   3359         return;
   3360     }
   3361 
   3362     saverr = ENOSYS;
   3363     if (!lo_data(req)->xattr) {
   3364         goto out;
   3365     }
   3366 
   3367     fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
   3368              size);
   3369 
   3370     if (size) {
   3371         value = g_try_malloc(size);
   3372         if (!value) {
   3373             goto out_err;
   3374         }
   3375     }
   3376 
   3377     sprintf(procname, "%i", inode->fd);
   3378     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3379         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3380         if (fd < 0) {
   3381             goto out_err;
   3382         }
   3383         ret = flistxattr(fd, value, size);
   3384         saverr = ret == -1 ? errno : 0;
   3385     } else {
   3386         /* fchdir should not fail here */
   3387         FCHDIR_NOFAIL(lo->proc_self_fd);
   3388         ret = listxattr(procname, value, size);
   3389         saverr = ret == -1 ? errno : 0;
   3390         FCHDIR_NOFAIL(lo->root.fd);
   3391     }
   3392 
   3393     if (ret == -1) {
   3394         goto out;
   3395     }
   3396     if (size) {
   3397         saverr = 0;
   3398         if (ret == 0) {
   3399             goto out;
   3400         }
   3401 
   3402         if (lo->xattr_map_list) {
   3403             /*
   3404              * Map the names back, some attributes might be dropped,
   3405              * some shortened, but not increased, so we shouldn't
   3406              * run out of room.
   3407              */
   3408             size_t out_index, in_index;
   3409             out_index = 0;
   3410             in_index = 0;
   3411             while (in_index < ret) {
   3412                 const char *map_out;
   3413                 char *in_ptr = value + in_index;
   3414                 /* Length of current attribute name */
   3415                 size_t in_len = strlen(value + in_index) + 1;
   3416 
   3417                 int mapret = xattr_map_server(lo, in_ptr, &map_out);
   3418                 if (mapret != -ENODATA && mapret != 0) {
   3419                     /* Shouldn't happen */
   3420                     saverr = -mapret;
   3421                     goto out;
   3422                 }
   3423                 if (mapret == 0) {
   3424                     /* Either unchanged, or truncated */
   3425                     size_t out_len;
   3426                     if (map_out != in_ptr) {
   3427                         /* +1 copies the NIL */
   3428                         out_len = strlen(map_out) + 1;
   3429                     } else {
   3430                         /* No change */
   3431                         out_len = in_len;
   3432                     }
   3433                     /*
   3434                      * Move result along, may still be needed for an unchanged
   3435                      * entry if a previous entry was changed.
   3436                      */
   3437                     memmove(value + out_index, map_out, out_len);
   3438 
   3439                     out_index += out_len;
   3440                 }
   3441                 in_index += in_len;
   3442             }
   3443             ret = out_index;
   3444             if (ret == 0) {
   3445                 goto out;
   3446             }
   3447         }
   3448 
   3449         ret = remove_blocked_xattrs(lo, value, ret);
   3450         if (ret <= 0) {
   3451             saverr = -ret;
   3452             goto out;
   3453         }
   3454         fuse_reply_buf(req, value, ret);
   3455     } else {
   3456         /*
   3457          * xattrmap only ever shortens the result,
   3458          * so we don't need to do anything clever with the
   3459          * allocation length here.
   3460          */
   3461         fuse_reply_xattr(req, ret);
   3462     }
   3463 out_free:
   3464     if (fd >= 0) {
   3465         close(fd);
   3466     }
   3467 
   3468     lo_inode_put(lo, &inode);
   3469     return;
   3470 
   3471 out_err:
   3472     saverr = errno;
   3473 out:
   3474     fuse_reply_err(req, saverr);
   3475     goto out_free;
   3476 }
   3477 
   3478 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
   3479                         const char *value, size_t size, int flags,
   3480                         uint32_t extra_flags)
   3481 {
   3482     char procname[64];
   3483     const char *name;
   3484     char *mapped_name;
   3485     struct lo_data *lo = lo_data(req);
   3486     struct lo_inode *inode;
   3487     ssize_t ret;
   3488     int saverr;
   3489     int fd = -1;
   3490     bool switched_creds = false;
   3491     bool cap_fsetid_dropped = false;
   3492     struct lo_cred old = {};
   3493 
   3494     if (block_xattr(lo, in_name)) {
   3495         fuse_reply_err(req, EOPNOTSUPP);
   3496         return;
   3497     }
   3498 
   3499     mapped_name = NULL;
   3500     name = in_name;
   3501     if (lo->xattrmap) {
   3502         ret = xattr_map_client(lo, in_name, &mapped_name);
   3503         if (ret < 0) {
   3504             fuse_reply_err(req, -ret);
   3505             return;
   3506         }
   3507         if (mapped_name) {
   3508             name = mapped_name;
   3509         }
   3510     }
   3511 
   3512     inode = lo_inode(req, ino);
   3513     if (!inode) {
   3514         fuse_reply_err(req, EBADF);
   3515         g_free(mapped_name);
   3516         return;
   3517     }
   3518 
   3519     saverr = ENOSYS;
   3520     if (!lo_data(req)->xattr) {
   3521         goto out;
   3522     }
   3523 
   3524     fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
   3525              ", name=%s value=%s size=%zd)\n", ino, name, value, size);
   3526 
   3527     sprintf(procname, "%i", inode->fd);
   3528     /*
   3529      * If we are setting posix access acl and if SGID needs to be
   3530      * cleared, then switch to caller's gid and drop CAP_FSETID
   3531      * and that should make sure host kernel clears SGID.
   3532      *
   3533      * This probably will not work when we support idmapped mounts.
   3534      * In that case we will need to find a non-root gid and switch
   3535      * to it. (Instead of gid in request). Fix it when we support
   3536      * idmapped mounts.
   3537      */
   3538     if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
   3539         && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
   3540         ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
   3541                                       &cap_fsetid_dropped);
   3542         if (ret) {
   3543             saverr = ret;
   3544             goto out;
   3545         }
   3546         switched_creds = true;
   3547     }
   3548     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3549         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3550         if (fd < 0) {
   3551             saverr = errno;
   3552             goto out;
   3553         }
   3554         ret = fsetxattr(fd, name, value, size, flags);
   3555         saverr = ret == -1 ? errno : 0;
   3556     } else {
   3557         /* fchdir should not fail here */
   3558         FCHDIR_NOFAIL(lo->proc_self_fd);
   3559         ret = setxattr(procname, name, value, size, flags);
   3560         saverr = ret == -1 ? errno : 0;
   3561         FCHDIR_NOFAIL(lo->root.fd);
   3562     }
   3563     if (switched_creds) {
   3564         if (cap_fsetid_dropped)
   3565             lo_restore_cred_gain_cap(&old, false, "FSETID");
   3566         else
   3567             lo_restore_cred(&old, false);
   3568     }
   3569 
   3570 out:
   3571     if (fd >= 0) {
   3572         close(fd);
   3573     }
   3574 
   3575     lo_inode_put(lo, &inode);
   3576     g_free(mapped_name);
   3577     fuse_reply_err(req, saverr);
   3578 }
   3579 
   3580 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
   3581 {
   3582     char procname[64];
   3583     const char *name;
   3584     char *mapped_name;
   3585     struct lo_data *lo = lo_data(req);
   3586     struct lo_inode *inode;
   3587     ssize_t ret;
   3588     int saverr;
   3589     int fd = -1;
   3590 
   3591     if (block_xattr(lo, in_name)) {
   3592         fuse_reply_err(req, EOPNOTSUPP);
   3593         return;
   3594     }
   3595 
   3596     mapped_name = NULL;
   3597     name = in_name;
   3598     if (lo->xattrmap) {
   3599         ret = xattr_map_client(lo, in_name, &mapped_name);
   3600         if (ret < 0) {
   3601             fuse_reply_err(req, -ret);
   3602             return;
   3603         }
   3604         if (mapped_name) {
   3605             name = mapped_name;
   3606         }
   3607     }
   3608 
   3609     inode = lo_inode(req, ino);
   3610     if (!inode) {
   3611         fuse_reply_err(req, EBADF);
   3612         g_free(mapped_name);
   3613         return;
   3614     }
   3615 
   3616     saverr = ENOSYS;
   3617     if (!lo_data(req)->xattr) {
   3618         goto out;
   3619     }
   3620 
   3621     fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
   3622              name);
   3623 
   3624     sprintf(procname, "%i", inode->fd);
   3625     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3626         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3627         if (fd < 0) {
   3628             saverr = errno;
   3629             goto out;
   3630         }
   3631         ret = fremovexattr(fd, name);
   3632         saverr = ret == -1 ? errno : 0;
   3633     } else {
   3634         /* fchdir should not fail here */
   3635         FCHDIR_NOFAIL(lo->proc_self_fd);
   3636         ret = removexattr(procname, name);
   3637         saverr = ret == -1 ? errno : 0;
   3638         FCHDIR_NOFAIL(lo->root.fd);
   3639     }
   3640 
   3641 out:
   3642     if (fd >= 0) {
   3643         close(fd);
   3644     }
   3645 
   3646     lo_inode_put(lo, &inode);
   3647     g_free(mapped_name);
   3648     fuse_reply_err(req, saverr);
   3649 }
   3650 
   3651 #ifdef HAVE_COPY_FILE_RANGE
   3652 static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
   3653                                struct fuse_file_info *fi_in, fuse_ino_t ino_out,
   3654                                off_t off_out, struct fuse_file_info *fi_out,
   3655                                size_t len, int flags)
   3656 {
   3657     int in_fd, out_fd;
   3658     ssize_t res;
   3659 
   3660     in_fd = lo_fi_fd(req, fi_in);
   3661     out_fd = lo_fi_fd(req, fi_out);
   3662 
   3663     fuse_log(FUSE_LOG_DEBUG,
   3664              "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
   3665              "off=%ju, ino=%" PRIu64 "/fd=%d, "
   3666              "off=%ju, size=%zd, flags=0x%x)\n",
   3667              ino_in, in_fd, (intmax_t)off_in,
   3668              ino_out, out_fd, (intmax_t)off_out, len, flags);
   3669 
   3670     res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
   3671     if (res < 0) {
   3672         fuse_reply_err(req, errno);
   3673     } else {
   3674         fuse_reply_write(req, res);
   3675     }
   3676 }
   3677 #endif
   3678 
   3679 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
   3680                      struct fuse_file_info *fi)
   3681 {
   3682     off_t res;
   3683 
   3684     (void)ino;
   3685     res = lseek(lo_fi_fd(req, fi), off, whence);
   3686     if (res != -1) {
   3687         fuse_reply_lseek(req, res);
   3688     } else {
   3689         fuse_reply_err(req, errno);
   3690     }
   3691 }
   3692 
   3693 static int lo_do_syncfs(struct lo_data *lo, struct lo_inode *inode)
   3694 {
   3695     int fd, ret = 0;
   3696 
   3697     fuse_log(FUSE_LOG_DEBUG, "lo_do_syncfs(ino=%" PRIu64 ")\n",
   3698              inode->fuse_ino);
   3699 
   3700     fd = lo_inode_open(lo, inode, O_RDONLY);
   3701     if (fd < 0) {
   3702         return -fd;
   3703     }
   3704 
   3705     if (syncfs(fd) < 0) {
   3706         ret = errno;
   3707     }
   3708 
   3709     close(fd);
   3710     return ret;
   3711 }
   3712 
   3713 static void lo_syncfs(fuse_req_t req, fuse_ino_t ino)
   3714 {
   3715     struct lo_data *lo = lo_data(req);
   3716     struct lo_inode *inode = lo_inode(req, ino);
   3717     int err;
   3718 
   3719     if (!inode) {
   3720         fuse_reply_err(req, EBADF);
   3721         return;
   3722     }
   3723 
   3724     err = lo_do_syncfs(lo, inode);
   3725     lo_inode_put(lo, &inode);
   3726 
   3727     /*
   3728      * If submounts aren't announced, the client only sends a request to
   3729      * sync the root inode. TODO: Track submounts internally and iterate
   3730      * over them as well.
   3731      */
   3732 
   3733     fuse_reply_err(req, err);
   3734 }
   3735 
   3736 static void lo_destroy(void *userdata)
   3737 {
   3738     struct lo_data *lo = (struct lo_data *)userdata;
   3739 
   3740     pthread_mutex_lock(&lo->mutex);
   3741     while (true) {
   3742         GHashTableIter iter;
   3743         gpointer key, value;
   3744 
   3745         g_hash_table_iter_init(&iter, lo->inodes);
   3746         if (!g_hash_table_iter_next(&iter, &key, &value)) {
   3747             break;
   3748         }
   3749 
   3750         struct lo_inode *inode = value;
   3751         unref_inode(lo, inode, inode->nlookup);
   3752     }
   3753     pthread_mutex_unlock(&lo->mutex);
   3754 }
   3755 
   3756 static struct fuse_lowlevel_ops lo_oper = {
   3757     .init = lo_init,
   3758     .lookup = lo_lookup,
   3759     .mkdir = lo_mkdir,
   3760     .mknod = lo_mknod,
   3761     .symlink = lo_symlink,
   3762     .link = lo_link,
   3763     .unlink = lo_unlink,
   3764     .rmdir = lo_rmdir,
   3765     .rename = lo_rename,
   3766     .forget = lo_forget,
   3767     .forget_multi = lo_forget_multi,
   3768     .getattr = lo_getattr,
   3769     .setattr = lo_setattr,
   3770     .readlink = lo_readlink,
   3771     .opendir = lo_opendir,
   3772     .readdir = lo_readdir,
   3773     .readdirplus = lo_readdirplus,
   3774     .releasedir = lo_releasedir,
   3775     .fsyncdir = lo_fsyncdir,
   3776     .create = lo_create,
   3777     .getlk = lo_getlk,
   3778     .setlk = lo_setlk,
   3779     .open = lo_open,
   3780     .release = lo_release,
   3781     .flush = lo_flush,
   3782     .fsync = lo_fsync,
   3783     .read = lo_read,
   3784     .write_buf = lo_write_buf,
   3785     .statfs = lo_statfs,
   3786     .fallocate = lo_fallocate,
   3787     .flock = lo_flock,
   3788     .getxattr = lo_getxattr,
   3789     .listxattr = lo_listxattr,
   3790     .setxattr = lo_setxattr,
   3791     .removexattr = lo_removexattr,
   3792 #ifdef HAVE_COPY_FILE_RANGE
   3793     .copy_file_range = lo_copy_file_range,
   3794 #endif
   3795     .lseek = lo_lseek,
   3796     .syncfs = lo_syncfs,
   3797     .destroy = lo_destroy,
   3798 };
   3799 
   3800 /* Print vhost-user.json backend program capabilities */
   3801 static void print_capabilities(void)
   3802 {
   3803     printf("{\n");
   3804     printf("  \"type\": \"fs\"\n");
   3805     printf("}\n");
   3806 }
   3807 
   3808 /*
   3809  * Drop all Linux capabilities because the wait parent process only needs to
   3810  * sit in waitpid(2) and terminate.
   3811  */
   3812 static void setup_wait_parent_capabilities(void)
   3813 {
   3814     capng_setpid(syscall(SYS_gettid));
   3815     capng_clear(CAPNG_SELECT_BOTH);
   3816     capng_apply(CAPNG_SELECT_BOTH);
   3817 }
   3818 
   3819 /*
   3820  * Move to a new mount, net, and pid namespaces to isolate this process.
   3821  */
   3822 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
   3823 {
   3824     pid_t child;
   3825 
   3826     /*
   3827      * Create a new pid namespace for *child* processes.  We'll have to
   3828      * fork in order to enter the new pid namespace.  A new mount namespace
   3829      * is also needed so that we can remount /proc for the new pid
   3830      * namespace.
   3831      *
   3832      * Our UNIX domain sockets have been created.  Now we can move to
   3833      * an empty network namespace to prevent TCP/IP and other network
   3834      * activity in case this process is compromised.
   3835      */
   3836     if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
   3837         fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
   3838         exit(1);
   3839     }
   3840 
   3841     child = fork();
   3842     if (child < 0) {
   3843         fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
   3844         exit(1);
   3845     }
   3846     if (child > 0) {
   3847         pid_t waited;
   3848         int wstatus;
   3849 
   3850         setup_wait_parent_capabilities();
   3851 
   3852         /* The parent waits for the child */
   3853         do {
   3854             waited = waitpid(child, &wstatus, 0);
   3855         } while (waited < 0 && errno == EINTR && !se->exited);
   3856 
   3857         /* We were terminated by a signal, see fuse_signals.c */
   3858         if (se->exited) {
   3859             exit(0);
   3860         }
   3861 
   3862         if (WIFEXITED(wstatus)) {
   3863             exit(WEXITSTATUS(wstatus));
   3864         }
   3865 
   3866         exit(1);
   3867     }
   3868 
   3869     /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
   3870     prctl(PR_SET_PDEATHSIG, SIGTERM);
   3871 
   3872     /*
   3873      * If the mounts have shared propagation then we want to opt out so our
   3874      * mount changes don't affect the parent mount namespace.
   3875      */
   3876     if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
   3877         fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
   3878         exit(1);
   3879     }
   3880 
   3881     /* The child must remount /proc to use the new pid namespace */
   3882     if (mount("proc", "/proc", "proc",
   3883               MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
   3884         fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
   3885         exit(1);
   3886     }
   3887 
   3888     /* Get the /proc/self/task descriptor */
   3889     lo->proc_self_task = open("/proc/self/task/", O_PATH);
   3890     if (lo->proc_self_task == -1) {
   3891         fuse_log(FUSE_LOG_ERR, "open(/proc/self/task, O_PATH): %m\n");
   3892         exit(1);
   3893     }
   3894 
   3895     lo->use_fscreate = is_fscreate_usable(lo);
   3896 
   3897     /*
   3898      * We only need /proc/self/fd. Prevent ".." from accessing parent
   3899      * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
   3900      * previously remounted with MS_REC | MS_SLAVE this mount change only
   3901      * affects our process.
   3902      */
   3903     if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
   3904         fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
   3905         exit(1);
   3906     }
   3907 
   3908     /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
   3909     lo->proc_self_fd = open("/proc", O_PATH);
   3910     if (lo->proc_self_fd == -1) {
   3911         fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
   3912         exit(1);
   3913     }
   3914 }
   3915 
   3916 /*
   3917  * Capture the capability state, we'll need to restore this for individual
   3918  * threads later; see load_capng.
   3919  */
   3920 static void setup_capng(void)
   3921 {
   3922     /* Note this accesses /proc so has to happen before the sandbox */
   3923     if (capng_get_caps_process()) {
   3924         fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
   3925         exit(1);
   3926     }
   3927     pthread_mutex_init(&cap.mutex, NULL);
   3928     pthread_mutex_lock(&cap.mutex);
   3929     cap.saved = capng_save_state();
   3930     if (!cap.saved) {
   3931         fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
   3932         exit(1);
   3933     }
   3934     pthread_mutex_unlock(&cap.mutex);
   3935 }
   3936 
   3937 static void cleanup_capng(void)
   3938 {
   3939     free(cap.saved);
   3940     cap.saved = NULL;
   3941     pthread_mutex_destroy(&cap.mutex);
   3942 }
   3943 
   3944 
   3945 /*
   3946  * Make the source directory our root so symlinks cannot escape and no other
   3947  * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
   3948  */
   3949 static void setup_mounts(const char *source)
   3950 {
   3951     int oldroot;
   3952     int newroot;
   3953 
   3954     if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
   3955         fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
   3956         exit(1);
   3957     }
   3958 
   3959     /* This magic is based on lxc's lxc_pivot_root() */
   3960     oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   3961     if (oldroot < 0) {
   3962         fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
   3963         exit(1);
   3964     }
   3965 
   3966     newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   3967     if (newroot < 0) {
   3968         fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
   3969         exit(1);
   3970     }
   3971 
   3972     if (fchdir(newroot) < 0) {
   3973         fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
   3974         exit(1);
   3975     }
   3976 
   3977     if (syscall(__NR_pivot_root, ".", ".") < 0) {
   3978         fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
   3979         exit(1);
   3980     }
   3981 
   3982     if (fchdir(oldroot) < 0) {
   3983         fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
   3984         exit(1);
   3985     }
   3986 
   3987     if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
   3988         fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
   3989         exit(1);
   3990     }
   3991 
   3992     if (umount2(".", MNT_DETACH) < 0) {
   3993         fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
   3994         exit(1);
   3995     }
   3996 
   3997     if (fchdir(newroot) < 0) {
   3998         fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
   3999         exit(1);
   4000     }
   4001 
   4002     close(newroot);
   4003     close(oldroot);
   4004 }
   4005 
   4006 /*
   4007  * Only keep capabilities in allowlist that are needed for file system operation
   4008  * The (possibly NULL) modcaps_in string passed in is free'd before exit.
   4009  */
   4010 static void setup_capabilities(char *modcaps_in)
   4011 {
   4012     char *modcaps = modcaps_in;
   4013     pthread_mutex_lock(&cap.mutex);
   4014     capng_restore_state(&cap.saved);
   4015 
   4016     /*
   4017      * Add to allowlist file system-related capabilities that are needed for a
   4018      * file server to act like root.  Drop everything else like networking and
   4019      * sysadmin capabilities.
   4020      *
   4021      * Exclusions:
   4022      * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
   4023      *    and we don't support that.
   4024      * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
   4025      *    used by the Smack LSM.  Omit it until there is demand for it.
   4026      */
   4027     capng_setpid(syscall(SYS_gettid));
   4028     capng_clear(CAPNG_SELECT_BOTH);
   4029     if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
   4030             CAP_CHOWN,
   4031             CAP_DAC_OVERRIDE,
   4032             CAP_FOWNER,
   4033             CAP_FSETID,
   4034             CAP_SETGID,
   4035             CAP_SETUID,
   4036             CAP_MKNOD,
   4037             CAP_SETFCAP,
   4038             -1)) {
   4039         fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
   4040         exit(1);
   4041     }
   4042 
   4043     /*
   4044      * The modcaps option is a colon separated list of caps,
   4045      * each preceded by either + or -.
   4046      */
   4047     while (modcaps) {
   4048         capng_act_t action;
   4049         int cap;
   4050 
   4051         char *next = strchr(modcaps, ':');
   4052         if (next) {
   4053             *next = '\0';
   4054             next++;
   4055         }
   4056 
   4057         switch (modcaps[0]) {
   4058         case '+':
   4059             action = CAPNG_ADD;
   4060             break;
   4061 
   4062         case '-':
   4063             action = CAPNG_DROP;
   4064             break;
   4065 
   4066         default:
   4067             fuse_log(FUSE_LOG_ERR,
   4068                      "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
   4069                      __func__, modcaps[0]);
   4070             exit(1);
   4071         }
   4072         cap = capng_name_to_capability(modcaps + 1);
   4073         if (cap < 0) {
   4074             fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
   4075                      modcaps);
   4076             exit(1);
   4077         }
   4078         if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
   4079             fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
   4080                      __func__, modcaps);
   4081             exit(1);
   4082         }
   4083 
   4084         modcaps = next;
   4085     }
   4086     g_free(modcaps_in);
   4087 
   4088     if (capng_apply(CAPNG_SELECT_BOTH)) {
   4089         fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
   4090         exit(1);
   4091     }
   4092 
   4093     cap.saved = capng_save_state();
   4094     if (!cap.saved) {
   4095         fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
   4096         exit(1);
   4097     }
   4098     pthread_mutex_unlock(&cap.mutex);
   4099 }
   4100 
   4101 /*
   4102  * Use chroot as a weaker sandbox for environments where the process is
   4103  * launched without CAP_SYS_ADMIN.
   4104  */
   4105 static void setup_chroot(struct lo_data *lo)
   4106 {
   4107     lo->proc_self_fd = open("/proc/self/fd", O_PATH);
   4108     if (lo->proc_self_fd == -1) {
   4109         fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
   4110         exit(1);
   4111     }
   4112 
   4113     lo->proc_self_task = open("/proc/self/task", O_PATH);
   4114     if (lo->proc_self_fd == -1) {
   4115         fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/task\", O_PATH): %m\n");
   4116         exit(1);
   4117     }
   4118 
   4119     lo->use_fscreate = is_fscreate_usable(lo);
   4120 
   4121     /*
   4122      * Make the shared directory the file system root so that FUSE_OPEN
   4123      * (lo_open()) cannot escape the shared directory by opening a symlink.
   4124      *
   4125      * The chroot(2) syscall is later disabled by seccomp and the
   4126      * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
   4127      * is not possible.
   4128      *
   4129      * However, it's still possible to escape the chroot via lo->proc_self_fd
   4130      * but that requires first gaining control of the process.
   4131      */
   4132     if (chroot(lo->source) != 0) {
   4133         fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
   4134         exit(1);
   4135     }
   4136 
   4137     /* Move into the chroot */
   4138     if (chdir("/") != 0) {
   4139         fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
   4140         exit(1);
   4141     }
   4142 }
   4143 
   4144 /*
   4145  * Lock down this process to prevent access to other processes or files outside
   4146  * source directory.  This reduces the impact of arbitrary code execution bugs.
   4147  */
   4148 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
   4149                           bool enable_syslog)
   4150 {
   4151     if (lo->sandbox == SANDBOX_NAMESPACE) {
   4152         setup_namespaces(lo, se);
   4153         setup_mounts(lo->source);
   4154     } else {
   4155         setup_chroot(lo);
   4156     }
   4157 
   4158     setup_seccomp(enable_syslog);
   4159     setup_capabilities(g_strdup(lo->modcaps));
   4160 }
   4161 
   4162 /* Set the maximum number of open file descriptors */
   4163 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
   4164 {
   4165     struct rlimit rlim = {
   4166         .rlim_cur = rlimit_nofile,
   4167         .rlim_max = rlimit_nofile,
   4168     };
   4169 
   4170     if (rlimit_nofile == 0) {
   4171         return; /* nothing to do */
   4172     }
   4173 
   4174     if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
   4175         /* Ignore SELinux denials */
   4176         if (errno == EPERM) {
   4177             return;
   4178         }
   4179 
   4180         fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
   4181         exit(1);
   4182     }
   4183 }
   4184 
   4185 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
   4186 {
   4187     g_autofree char *localfmt = NULL;
   4188     char buf[64];
   4189 
   4190     if (current_log_level < level) {
   4191         return;
   4192     }
   4193 
   4194     if (current_log_level == FUSE_LOG_DEBUG) {
   4195         if (use_syslog) {
   4196             /* no timestamp needed */
   4197             localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
   4198                                        fmt);
   4199         } else {
   4200             g_autoptr(GDateTime) now = g_date_time_new_now_utc();
   4201             g_autofree char *nowstr = g_date_time_format(now,
   4202                                        "%Y-%m-%d %H:%M:%S.%%06d%z");
   4203             snprintf(buf, 64, nowstr, g_date_time_get_microsecond(now));
   4204             localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
   4205                                        buf, syscall(__NR_gettid), fmt);
   4206         }
   4207         fmt = localfmt;
   4208     }
   4209 
   4210     if (use_syslog) {
   4211         int priority = LOG_ERR;
   4212         switch (level) {
   4213         case FUSE_LOG_EMERG:
   4214             priority = LOG_EMERG;
   4215             break;
   4216         case FUSE_LOG_ALERT:
   4217             priority = LOG_ALERT;
   4218             break;
   4219         case FUSE_LOG_CRIT:
   4220             priority = LOG_CRIT;
   4221             break;
   4222         case FUSE_LOG_ERR:
   4223             priority = LOG_ERR;
   4224             break;
   4225         case FUSE_LOG_WARNING:
   4226             priority = LOG_WARNING;
   4227             break;
   4228         case FUSE_LOG_NOTICE:
   4229             priority = LOG_NOTICE;
   4230             break;
   4231         case FUSE_LOG_INFO:
   4232             priority = LOG_INFO;
   4233             break;
   4234         case FUSE_LOG_DEBUG:
   4235             priority = LOG_DEBUG;
   4236             break;
   4237         }
   4238         vsyslog(priority, fmt, ap);
   4239     } else {
   4240         vfprintf(stderr, fmt, ap);
   4241     }
   4242 }
   4243 
   4244 static void setup_root(struct lo_data *lo, struct lo_inode *root)
   4245 {
   4246     int fd, res;
   4247     struct stat stat;
   4248     uint64_t mnt_id;
   4249 
   4250     fd = open("/", O_PATH);
   4251     if (fd == -1) {
   4252         fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
   4253         exit(1);
   4254     }
   4255 
   4256     res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
   4257                    &mnt_id);
   4258     if (res == -1) {
   4259         fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
   4260         exit(1);
   4261     }
   4262 
   4263     root->filetype = S_IFDIR;
   4264     root->fd = fd;
   4265     root->key.ino = stat.st_ino;
   4266     root->key.dev = stat.st_dev;
   4267     root->key.mnt_id = mnt_id;
   4268     root->nlookup = 2;
   4269     g_atomic_int_set(&root->refcount, 2);
   4270     if (lo->posix_lock) {
   4271         pthread_mutex_init(&root->plock_mutex, NULL);
   4272         root->posix_locks = g_hash_table_new_full(
   4273             g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
   4274     }
   4275 }
   4276 
   4277 static guint lo_key_hash(gconstpointer key)
   4278 {
   4279     const struct lo_key *lkey = key;
   4280 
   4281     return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
   4282 }
   4283 
   4284 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
   4285 {
   4286     const struct lo_key *la = a;
   4287     const struct lo_key *lb = b;
   4288 
   4289     return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
   4290 }
   4291 
   4292 static void fuse_lo_data_cleanup(struct lo_data *lo)
   4293 {
   4294     if (lo->inodes) {
   4295         g_hash_table_destroy(lo->inodes);
   4296     }
   4297 
   4298     if (lo->root.posix_locks) {
   4299         g_hash_table_destroy(lo->root.posix_locks);
   4300     }
   4301     lo_map_destroy(&lo->fd_map);
   4302     lo_map_destroy(&lo->dirp_map);
   4303     lo_map_destroy(&lo->ino_map);
   4304 
   4305     if (lo->proc_self_fd >= 0) {
   4306         close(lo->proc_self_fd);
   4307     }
   4308 
   4309     if (lo->proc_self_task >= 0) {
   4310         close(lo->proc_self_task);
   4311     }
   4312 
   4313     if (lo->root.fd >= 0) {
   4314         close(lo->root.fd);
   4315     }
   4316 
   4317     free(lo->xattrmap);
   4318     free_xattrmap(lo);
   4319     free(lo->xattr_security_capability);
   4320     free(lo->source);
   4321 }
   4322 
   4323 static void qemu_version(void)
   4324 {
   4325     printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
   4326 }
   4327 
   4328 int main(int argc, char *argv[])
   4329 {
   4330     struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
   4331     struct fuse_session *se;
   4332     struct fuse_cmdline_opts opts;
   4333     struct lo_data lo = {
   4334         .sandbox = SANDBOX_NAMESPACE,
   4335         .debug = 0,
   4336         .writeback = 0,
   4337         .posix_lock = 0,
   4338         .allow_direct_io = 0,
   4339         .proc_self_fd = -1,
   4340         .proc_self_task = -1,
   4341         .user_killpriv_v2 = -1,
   4342         .user_posix_acl = -1,
   4343         .user_security_label = -1,
   4344     };
   4345     struct lo_map_elem *root_elem;
   4346     struct lo_map_elem *reserve_elem;
   4347     int ret = -1;
   4348 
   4349     /* Initialize time conversion information for localtime_r(). */
   4350     tzset();
   4351 
   4352     /* Don't mask creation mode, kernel already did that */
   4353     umask(0);
   4354 
   4355     qemu_init_exec_dir(argv[0]);
   4356 
   4357     drop_supplementary_groups();
   4358 
   4359     pthread_mutex_init(&lo.mutex, NULL);
   4360     lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
   4361     lo.root.fd = -1;
   4362     lo.root.fuse_ino = FUSE_ROOT_ID;
   4363     lo.cache = CACHE_AUTO;
   4364 
   4365     /*
   4366      * Set up the ino map like this:
   4367      * [0] Reserved (will not be used)
   4368      * [1] Root inode
   4369      */
   4370     lo_map_init(&lo.ino_map);
   4371     reserve_elem = lo_map_reserve(&lo.ino_map, 0);
   4372     if (!reserve_elem) {
   4373         fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
   4374         goto err_out1;
   4375     }
   4376     reserve_elem->in_use = false;
   4377     root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
   4378     if (!root_elem) {
   4379         fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
   4380         goto err_out1;
   4381     }
   4382     root_elem->inode = &lo.root;
   4383 
   4384     lo_map_init(&lo.dirp_map);
   4385     lo_map_init(&lo.fd_map);
   4386 
   4387     if (fuse_parse_cmdline(&args, &opts) != 0) {
   4388         goto err_out1;
   4389     }
   4390     fuse_set_log_func(log_func);
   4391     use_syslog = opts.syslog;
   4392     if (use_syslog) {
   4393         openlog("virtiofsd", LOG_PID, LOG_DAEMON);
   4394     }
   4395 
   4396     if (opts.show_help) {
   4397         printf("usage: %s [options]\n\n", argv[0]);
   4398         fuse_cmdline_help();
   4399         printf("    -o source=PATH             shared directory tree\n");
   4400         fuse_lowlevel_help();
   4401         ret = 0;
   4402         goto err_out1;
   4403     } else if (opts.show_version) {
   4404         qemu_version();
   4405         fuse_lowlevel_version();
   4406         ret = 0;
   4407         goto err_out1;
   4408     } else if (opts.print_capabilities) {
   4409         print_capabilities();
   4410         ret = 0;
   4411         goto err_out1;
   4412     }
   4413 
   4414     if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
   4415         goto err_out1;
   4416     }
   4417 
   4418     if (opts.log_level != 0) {
   4419         current_log_level = opts.log_level;
   4420     } else {
   4421         /* default log level is INFO */
   4422         current_log_level = FUSE_LOG_INFO;
   4423     }
   4424     lo.debug = opts.debug;
   4425     if (lo.debug) {
   4426         current_log_level = FUSE_LOG_DEBUG;
   4427     }
   4428     if (lo.source) {
   4429         struct stat stat;
   4430         int res;
   4431 
   4432         res = lstat(lo.source, &stat);
   4433         if (res == -1) {
   4434             fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
   4435                      lo.source);
   4436             exit(1);
   4437         }
   4438         if (!S_ISDIR(stat.st_mode)) {
   4439             fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
   4440             exit(1);
   4441         }
   4442     } else {
   4443         lo.source = strdup("/");
   4444         if (!lo.source) {
   4445             fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
   4446             goto err_out1;
   4447         }
   4448     }
   4449 
   4450     if (lo.xattrmap) {
   4451         lo.xattr = 1;
   4452         parse_xattrmap(&lo);
   4453     }
   4454 
   4455     if (!lo.timeout_set) {
   4456         switch (lo.cache) {
   4457         case CACHE_NONE:
   4458             lo.timeout = 0.0;
   4459             break;
   4460 
   4461         case CACHE_AUTO:
   4462             lo.timeout = 1.0;
   4463             break;
   4464 
   4465         case CACHE_ALWAYS:
   4466             lo.timeout = 86400.0;
   4467             break;
   4468         }
   4469     } else if (lo.timeout < 0) {
   4470         fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
   4471         exit(1);
   4472     }
   4473 
   4474     if (lo.user_posix_acl == 1 && !lo.xattr) {
   4475         fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
   4476                  "\n");
   4477         exit(1);
   4478     }
   4479 
   4480     lo.use_statx = true;
   4481 
   4482     se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
   4483     if (se == NULL) {
   4484         goto err_out1;
   4485     }
   4486 
   4487     if (fuse_set_signal_handlers(se) != 0) {
   4488         goto err_out2;
   4489     }
   4490 
   4491     if (fuse_session_mount(se) != 0) {
   4492         goto err_out3;
   4493     }
   4494 
   4495     fuse_daemonize(opts.foreground);
   4496 
   4497     setup_nofile_rlimit(opts.rlimit_nofile);
   4498 
   4499     /* Must be before sandbox since it wants /proc */
   4500     setup_capng();
   4501 
   4502     setup_sandbox(&lo, se, opts.syslog);
   4503 
   4504     setup_root(&lo, &lo.root);
   4505     /* Block until ctrl+c or fusermount -u */
   4506     ret = virtio_loop(se);
   4507 
   4508     fuse_session_unmount(se);
   4509     cleanup_capng();
   4510 err_out3:
   4511     fuse_remove_signal_handlers(se);
   4512 err_out2:
   4513     fuse_session_destroy(se);
   4514 err_out1:
   4515     fuse_opt_free_args(&args);
   4516 
   4517     fuse_lo_data_cleanup(&lo);
   4518 
   4519     return ret ? 1 : 0;
   4520 }