qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

userfaultfd.c (9437B)


      1 /*
      2  * Linux UFFD-WP support
      3  *
      4  * Copyright Virtuozzo GmbH, 2020
      5  *
      6  * Authors:
      7  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
      8  *
      9  * This work is licensed under the terms of the GNU GPL, version 2 or
     10  * later.  See the COPYING file in the top-level directory.
     11  */
     12 
     13 #include "qemu/osdep.h"
     14 #include "qemu/bitops.h"
     15 #include "qemu/error-report.h"
     16 #include "qemu/userfaultfd.h"
     17 #include "trace.h"
     18 #include <poll.h>
     19 #include <sys/syscall.h>
     20 #include <sys/ioctl.h>
     21 
     22 /**
     23  * uffd_query_features: query UFFD features
     24  *
     25  * Returns: 0 on success, negative value in case of an error
     26  *
     27  * @features: parameter to receive 'uffdio_api.features'
     28  */
     29 int uffd_query_features(uint64_t *features)
     30 {
     31     int uffd_fd;
     32     struct uffdio_api api_struct = { 0 };
     33     int ret = -1;
     34 
     35     uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
     36     if (uffd_fd < 0) {
     37         trace_uffd_query_features_nosys(errno);
     38         return -1;
     39     }
     40 
     41     api_struct.api = UFFD_API;
     42     api_struct.features = 0;
     43 
     44     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
     45         trace_uffd_query_features_api_failed(errno);
     46         goto out;
     47     }
     48     *features = api_struct.features;
     49     ret = 0;
     50 
     51 out:
     52     close(uffd_fd);
     53     return ret;
     54 }
     55 
     56 /**
     57  * uffd_create_fd: create UFFD file descriptor
     58  *
     59  * Returns non-negative file descriptor or negative value in case of an error
     60  *
     61  * @features: UFFD features to request
     62  * @non_blocking: create UFFD file descriptor for non-blocking operation
     63  */
     64 int uffd_create_fd(uint64_t features, bool non_blocking)
     65 {
     66     int uffd_fd;
     67     int flags;
     68     struct uffdio_api api_struct = { 0 };
     69     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
     70 
     71     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
     72     uffd_fd = syscall(__NR_userfaultfd, flags);
     73     if (uffd_fd < 0) {
     74         trace_uffd_create_fd_nosys(errno);
     75         return -1;
     76     }
     77 
     78     api_struct.api = UFFD_API;
     79     api_struct.features = features;
     80     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
     81         trace_uffd_create_fd_api_failed(errno);
     82         goto fail;
     83     }
     84     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
     85         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
     86         goto fail;
     87     }
     88 
     89     return uffd_fd;
     90 
     91 fail:
     92     close(uffd_fd);
     93     return -1;
     94 }
     95 
     96 /**
     97  * uffd_close_fd: close UFFD file descriptor
     98  *
     99  * @uffd_fd: UFFD file descriptor
    100  */
    101 void uffd_close_fd(int uffd_fd)
    102 {
    103     assert(uffd_fd >= 0);
    104     close(uffd_fd);
    105 }
    106 
    107 /**
    108  * uffd_register_memory: register memory range via UFFD-IO
    109  *
    110  * Returns 0 in case of success, negative value in case of an error
    111  *
    112  * @uffd_fd: UFFD file descriptor
    113  * @addr: base address of memory range
    114  * @length: length of memory range
    115  * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
    116  * @ioctls: optional pointer to receive supported IOCTL mask
    117  */
    118 int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
    119         uint64_t mode, uint64_t *ioctls)
    120 {
    121     struct uffdio_register uffd_register;
    122 
    123     uffd_register.range.start = (uintptr_t) addr;
    124     uffd_register.range.len = length;
    125     uffd_register.mode = mode;
    126 
    127     if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
    128         trace_uffd_register_memory_failed(addr, length, mode, errno);
    129         return -1;
    130     }
    131     if (ioctls) {
    132         *ioctls = uffd_register.ioctls;
    133     }
    134 
    135     return 0;
    136 }
    137 
    138 /**
    139  * uffd_unregister_memory: un-register memory range with UFFD-IO
    140  *
    141  * Returns 0 in case of success, negative value in case of an error
    142  *
    143  * @uffd_fd: UFFD file descriptor
    144  * @addr: base address of memory range
    145  * @length: length of memory range
    146  */
    147 int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
    148 {
    149     struct uffdio_range uffd_range;
    150 
    151     uffd_range.start = (uintptr_t) addr;
    152     uffd_range.len = length;
    153 
    154     if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
    155         trace_uffd_unregister_memory_failed(addr, length, errno);
    156         return -1;
    157     }
    158 
    159     return 0;
    160 }
    161 
    162 /**
    163  * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
    164  *
    165  * Returns 0 on success, negative value in case of error
    166  *
    167  * @uffd_fd: UFFD file descriptor
    168  * @addr: base address of memory range
    169  * @length: length of memory range
    170  * @wp: write-protect/unprotect
    171  * @dont_wake: do not wake threads waiting on wr-protected page
    172  */
    173 int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
    174         bool wp, bool dont_wake)
    175 {
    176     struct uffdio_writeprotect uffd_writeprotect;
    177 
    178     uffd_writeprotect.range.start = (uintptr_t) addr;
    179     uffd_writeprotect.range.len = length;
    180     if (!wp && dont_wake) {
    181         /* DONTWAKE is meaningful only on protection release */
    182         uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    183     } else {
    184         uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
    185     }
    186 
    187     if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
    188         error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
    189                 " mode=%" PRIx64 " errno=%i", addr, length,
    190                 (uint64_t) uffd_writeprotect.mode, errno);
    191         return -1;
    192     }
    193 
    194     return 0;
    195 }
    196 
    197 /**
    198  * uffd_copy_page: copy range of pages to destination via UFFD-IO
    199  *
    200  * Copy range of source pages to the destination to resolve
    201  * missing page fault somewhere in the destination range.
    202  *
    203  * Returns 0 on success, negative value in case of an error
    204  *
    205  * @uffd_fd: UFFD file descriptor
    206  * @dst_addr: destination base address
    207  * @src_addr: source base address
    208  * @length: length of the range to copy
    209  * @dont_wake: do not wake threads waiting on missing page
    210  */
    211 int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
    212         uint64_t length, bool dont_wake)
    213 {
    214     struct uffdio_copy uffd_copy;
    215 
    216     uffd_copy.dst = (uintptr_t) dst_addr;
    217     uffd_copy.src = (uintptr_t) src_addr;
    218     uffd_copy.len = length;
    219     uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
    220 
    221     if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
    222         error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
    223                 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
    224                 length, (uint64_t) uffd_copy.mode, errno);
    225         return -1;
    226     }
    227 
    228     return 0;
    229 }
    230 
    231 /**
    232  * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
    233  *
    234  * Fill range pages with zeroes to resolve missing page fault within the range.
    235  *
    236  * Returns 0 on success, negative value in case of an error
    237  *
    238  * @uffd_fd: UFFD file descriptor
    239  * @addr: base address
    240  * @length: length of the range to fill with zeroes
    241  * @dont_wake: do not wake threads waiting on missing page
    242  */
    243 int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
    244 {
    245     struct uffdio_zeropage uffd_zeropage;
    246 
    247     uffd_zeropage.range.start = (uintptr_t) addr;
    248     uffd_zeropage.range.len = length;
    249     uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
    250 
    251     if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
    252         error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
    253                 " mode=%" PRIx64 " errno=%i", addr, length,
    254                 (uint64_t) uffd_zeropage.mode, errno);
    255         return -1;
    256     }
    257 
    258     return 0;
    259 }
    260 
    261 /**
    262  * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution
    263  *
    264  * Wake up threads waiting on any page/pages from the designated range.
    265  * The main use case is when during some period, page faults are resolved
    266  * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
    267  * for the whole memory range are satisfied in a single call to uffd_wakeup().
    268  *
    269  * Returns 0 on success, negative value in case of an error
    270  *
    271  * @uffd_fd: UFFD file descriptor
    272  * @addr: base address
    273  * @length: length of the range
    274  */
    275 int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
    276 {
    277     struct uffdio_range uffd_range;
    278 
    279     uffd_range.start = (uintptr_t) addr;
    280     uffd_range.len = length;
    281 
    282     if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
    283         error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
    284                 addr, length, errno);
    285         return -1;
    286     }
    287 
    288     return 0;
    289 }
    290 
    291 /**
    292  * uffd_read_events: read pending UFFD events
    293  *
    294  * Returns number of fetched messages, 0 if non is available or
    295  * negative value in case of an error
    296  *
    297  * @uffd_fd: UFFD file descriptor
    298  * @msgs: pointer to message buffer
    299  * @count: number of messages that can fit in the buffer
    300  */
    301 int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
    302 {
    303     ssize_t res;
    304     do {
    305         res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
    306     } while (res < 0 && errno == EINTR);
    307 
    308     if ((res < 0 && errno == EAGAIN)) {
    309         return 0;
    310     }
    311     if (res < 0) {
    312         error_report("uffd_read_events() failed: errno=%i", errno);
    313         return -1;
    314     }
    315 
    316     return (int) (res / sizeof(struct uffd_msg));
    317 }
    318 
    319 /**
    320  * uffd_poll_events: poll UFFD file descriptor for read
    321  *
    322  * Returns true if events are available for read, false otherwise
    323  *
    324  * @uffd_fd: UFFD file descriptor
    325  * @tmo: timeout value
    326  */
    327 bool uffd_poll_events(int uffd_fd, int tmo)
    328 {
    329     int res;
    330     struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    331 
    332     do {
    333         res = poll(&poll_fd, 1, tmo);
    334     } while (res < 0 && errno == EINTR);
    335 
    336     if (res == 0) {
    337         return false;
    338     }
    339     if (res < 0) {
    340         error_report("uffd_poll_events() failed: errno=%i", errno);
    341         return false;
    342     }
    343 
    344     return (poll_fd.revents & POLLIN) != 0;
    345 }