qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

fdmon-io_uring.c (11476B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Linux io_uring file descriptor monitoring
 *
 * The Linux io_uring API supports file descriptor monitoring with a few
 * advantages over existing APIs like poll(2) and epoll(7):
 *
 * 1. Userspace polling of events is possible because the completion queue (cq
 *    ring) is shared between the kernel and userspace.  This allows
 *    applications that rely on userspace polling to also monitor file
 *    descriptors in the same userspace polling loop.
 *
 * 2. Submission and completion are batched and done together in a single
 *    system call.  This minimizes the number of system calls.
 *
 * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
 *    poll(2).
 *
 * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
 *    epoll(7), whose epoll_wait(2) timeout only has millisecond resolution
 *    and therefore needs an extra timerfd for finer-grained timeouts.
 *
 * This code only monitors file descriptors and does not do asynchronous disk
 * I/O.  Implementing disk I/O efficiently has other requirements and should
 * use a separate io_uring so it does not make sense to unify the code.
 *
 * File descriptor monitoring is implemented using the following operations:
 *
 * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
 * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored.  When
 *    the poll mask changes for a file descriptor it is first removed and then
 *    re-added with the new poll mask, so this operation is also used as part
 *    of modifying an existing monitored file descriptor.
 * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
 *    for events.  This operation self-cancels if another event completes
 *    before the timeout.
 *
 * io_uring calls the submission queue the "sq ring" and the completion queue
 * the "cq ring".  Ring entries are called "sqe" and "cqe", respectively.
 *
 * The code is structured so that sq/cq rings are only modified within
 * fdmon_io_uring_wait().  Changes to AioHandlers are made by enqueuing them on
 * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
 * and/or IORING_OP_POLL_REMOVE sqes for them.
 */
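
/*
 * For readers new to liburing, here is a minimal self-contained poll loop
 * using the same calls this file is built on (io_uring_queue_init(),
 * io_uring_get_sqe(), io_uring_prep_poll_add(), io_uring_submit_and_wait(),
 * io_uring_for_each_cqe(), io_uring_cq_advance()).  It is illustrative only
 * and not part of QEMU; error handling and the ring size are simplified:
 *
 *   #include <liburing.h>
 *   #include <poll.h>
 *   #include <stdio.h>
 *   #include <unistd.h>
 *
 *   int main(void)
 *   {
 *       struct io_uring ring;
 *       struct io_uring_sqe *sqe;
 *       struct io_uring_cqe *cqe;
 *       unsigned head, nr = 0;
 *
 *       if (io_uring_queue_init(8, &ring, 0) < 0) {
 *           return 1;
 *       }
 *
 *       // One-shot poll request: complete when stdin becomes readable
 *       sqe = io_uring_get_sqe(&ring);
 *       io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
 *       io_uring_sqe_set_data(sqe, NULL);
 *
 *       // Submit pending sqes and block until at least one cqe is ready
 *       io_uring_submit_and_wait(&ring, 1);
 *
 *       // Drain the cq ring; cqe->res holds the revents bits for poll requests
 *       io_uring_for_each_cqe(&ring, head, cqe) {
 *           printf("poll revents: 0x%x\n", (unsigned)cqe->res);
 *           nr++;
 *       }
 *       io_uring_cq_advance(&ring, nr);
 *
 *       io_uring_queue_exit(&ring);
 *       return 0;
 *   }
 */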

#include "qemu/osdep.h"
#include <poll.h>
#include "qemu/rcu_queue.h"
#include "aio-posix.h"

enum {
    FDMON_IO_URING_ENTRIES  = 128, /* sq/cq ring size */

    /* AioHandler::flags */
    FDMON_IO_URING_PENDING  = (1 << 0),
    FDMON_IO_URING_ADD      = (1 << 1),
    FDMON_IO_URING_REMOVE   = (1 << 2),
};

static inline int poll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? POLLIN : 0) |
           (pfd_events & G_IO_OUT ? POLLOUT : 0) |
           (pfd_events & G_IO_HUP ? POLLHUP : 0) |
           (pfd_events & G_IO_ERR ? POLLERR : 0);
}

static inline int pfd_events_from_poll(int poll_events)
{
    return (poll_events & POLLIN ? G_IO_IN : 0) |
           (poll_events & POLLOUT ? G_IO_OUT : 0) |
           (poll_events & POLLHUP ? G_IO_HUP : 0) |
           (poll_events & POLLERR ? G_IO_ERR : 0);
}

/*
 * Returns an sqe for submitting a request.  Must only be called within
 * fdmon_io_uring_wait().
 */
static struct io_uring_sqe *get_sqe(AioContext *ctx)
{
    struct io_uring *ring = &ctx->fdmon_io_uring;
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
    int ret;

    if (likely(sqe)) {
        return sqe;
    }

    /* No free sqes left, submit pending sqes first */
    do {
        ret = io_uring_submit(ring);
    } while (ret == -EINTR);

    assert(ret > 1);
    sqe = io_uring_get_sqe(ring);
    assert(sqe);
    return sqe;
}

/* Atomically enqueue an AioHandler for sq ring submission */
static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
{
    unsigned old_flags;

    old_flags = qatomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
    if (!(old_flags & FDMON_IO_URING_PENDING)) {
        QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
    }
}

/* Dequeue an AioHandler for sq ring submission.  Called by fill_sq_ring(). */
static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
{
    AioHandler *node = QSLIST_FIRST(head);

    if (!node) {
        return NULL;
    }

    /* Doesn't need to be atomic since fill_sq_ring() moves the list */
    QSLIST_REMOVE_HEAD(head, node_submitted);

    /*
     * Don't clear FDMON_IO_URING_REMOVE.  It's sticky so it can serve two
     * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
     * telling process_cqe() to delete the AioHandler when its
     * IORING_OP_POLL_ADD completes.
     */
    *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
                                              FDMON_IO_URING_ADD));
    return node;
}
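
/*
 * The enqueue()/dequeue() pair above uses a "mark pending, publish once"
 * idiom: flag bits are merged atomically and the node is only inserted into
 * the list on the transition of FDMON_IO_URING_PENDING from 0 to 1.  A
 * stripped-down sketch of the same idea with C11 atomics (the types, names
 * and flag values below are illustrative, not QEMU's):
 *
 *   #include <stdatomic.h>
 *
 *   enum { PENDING = 1u << 0, ADD = 1u << 1, REMOVE = 1u << 2 };
 *
 *   struct node {
 *       _Atomic unsigned flags;
 *       struct node *next;
 *   };
 *
 *   static struct node *_Atomic submit_head;
 *
 *   static void sketch_enqueue(struct node *n, unsigned extra)
 *   {
 *       // Merge PENDING and the requested ADD/REMOVE bits in one atomic step
 *       unsigned old = atomic_fetch_or(&n->flags, PENDING | extra);
 *
 *       if (old & PENDING) {
 *           return; // already on the list; only the flags changed
 *       }
 *
 *       // First transition to PENDING: push onto the lock-free list head
 *       struct node *head = atomic_load(&submit_head);
 *       do {
 *           n->next = head;
 *       } while (!atomic_compare_exchange_weak(&submit_head, &head, n));
 *   }
 */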

static void fdmon_io_uring_update(AioContext *ctx,
                                  AioHandler *old_node,
                                  AioHandler *new_node)
{
    if (new_node) {
        enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
    }

    if (old_node) {
        /*
         * Deletion is tricky because IORING_OP_POLL_ADD and
         * IORING_OP_POLL_REMOVE are async.  We need to wait for the original
         * IORING_OP_POLL_ADD to complete before this handler can be freed
         * safely.
         *
         * It's possible that the file descriptor becomes ready and the
         * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
         * submitted, too.
         *
         * Mark this handler deleted right now but don't place it on
         * ctx->deleted_aio_handlers yet.  Instead, manually fudge the list
         * entry to make QLIST_IS_INSERTED() think this handler has been
         * inserted so that other code recognizes this AioHandler as deleted.
         *
         * Once the original IORING_OP_POLL_ADD completes we enqueue the
         * handler on the real ctx->deleted_aio_handlers list to be freed.
         */
        assert(!QLIST_IS_INSERTED(old_node, node_deleted));
        old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;

        enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
    }
}

static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
{
    struct io_uring_sqe *sqe = get_sqe(ctx);
    int events = poll_events_from_pfd(node->pfd.events);

    io_uring_prep_poll_add(sqe, node->pfd.fd, events);
    io_uring_sqe_set_data(sqe, node);
}

static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
{
    struct io_uring_sqe *sqe = get_sqe(ctx);

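    /*
     * Newer liburing (when LIBURING_HAVE_DATA64 is defined) identifies the
     * poll request to cancel by its 64-bit user_data value instead of by
     * pointer.
     */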
#ifdef LIBURING_HAVE_DATA64
    io_uring_prep_poll_remove(sqe, (__u64)(uintptr_t)node);
#else
    io_uring_prep_poll_remove(sqe, node);
#endif
}

/* Add a timeout that self-cancels when another cqe becomes ready */
static void add_timeout_sqe(AioContext *ctx, int64_t ns)
{
    struct io_uring_sqe *sqe;
    struct __kernel_timespec ts = {
        .tv_sec = ns / NANOSECONDS_PER_SECOND,
        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
    };

    sqe = get_sqe(ctx);
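    /* count=1: complete after one other cqe, or when ts expires */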
    io_uring_prep_timeout(sqe, &ts, 1, 0);
}

/* Add sqes from ctx->submit_list for submission */
static void fill_sq_ring(AioContext *ctx)
{
    AioHandlerSList submit_list;
    AioHandler *node;
    unsigned flags;

    QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);

    while ((node = dequeue(&submit_list, &flags))) {
        /* Order matters, just in case both flags were set */
        if (flags & FDMON_IO_URING_ADD) {
            add_poll_add_sqe(ctx, node);
        }
        if (flags & FDMON_IO_URING_REMOVE) {
            add_poll_remove_sqe(ctx, node);
        }
    }
}

/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
                        AioHandlerList *ready_list,
                        struct io_uring_cqe *cqe)
{
    AioHandler *node = io_uring_cqe_get_data(cqe);
    unsigned flags;

    /* poll_timeout and poll_remove have a zero user_data field */
    if (!node) {
        return false;
    }

    /*
     * Deletion can only happen when IORING_OP_POLL_ADD completes.  If we race
     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
     * bit before IORING_OP_POLL_REMOVE is submitted.
     */
    flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
    if (flags & FDMON_IO_URING_REMOVE) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        return false;
    }

    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));

    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
    add_poll_add_sqe(ctx, node);
    return true;
}

static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
{
    struct io_uring *ring = &ctx->fdmon_io_uring;
    struct io_uring_cqe *cqe;
    unsigned num_cqes = 0;
    unsigned num_ready = 0;
    unsigned head;

    io_uring_for_each_cqe(ring, head, cqe) {
        if (process_cqe(ctx, ready_list, cqe)) {
            num_ready++;
        }

        num_cqes++;
    }

    io_uring_cq_advance(ring, num_cqes);
    return num_ready;
}

static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
                               int64_t timeout)
{
    unsigned wait_nr = 1; /* block until at least one cqe is ready */
    int ret;

    /* Fall back while external clients are disabled */
    if (qatomic_read(&ctx->external_disable_cnt)) {
        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
    }

    if (timeout == 0) {
        wait_nr = 0; /* non-blocking */
    } else if (timeout > 0) {
        add_timeout_sqe(ctx, timeout);
    }

    fill_sq_ring(ctx);

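    /*
     * io_uring_submit_and_wait() returns the number of sqes submitted, or a
     * negative errno on failure.
     */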
    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
    } while (ret == -EINTR);

    assert(ret >= 0);

    return process_cq_ring(ctx, ready_list);
}

static bool fdmon_io_uring_need_wait(AioContext *ctx)
{
    /* Have io_uring events completed? */
    if (io_uring_cq_ready(&ctx->fdmon_io_uring)) {
        return true;
    }

    /* Are there pending sqes to submit? */
    if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
        return true;
    }

    /* Do we need to process AioHandlers for io_uring changes? */
    if (!QSLIST_EMPTY_RCU(&ctx->submit_list)) {
        return true;
    }

    /* Are we falling back to fdmon-poll? */
    return qatomic_read(&ctx->external_disable_cnt);
}

static const FDMonOps fdmon_io_uring_ops = {
    .update = fdmon_io_uring_update,
    .wait = fdmon_io_uring_wait,
    .need_wait = fdmon_io_uring_need_wait,
};

bool fdmon_io_uring_setup(AioContext *ctx)
{
    int ret;

    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
    if (ret != 0) {
        return false;
    }

    QSLIST_INIT(&ctx->submit_list);
    ctx->fdmon_ops = &fdmon_io_uring_ops;
    return true;
}

void fdmon_io_uring_destroy(AioContext *ctx)
{
    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
        AioHandler *node;

        io_uring_queue_exit(&ctx->fdmon_io_uring);

        /* Move handlers due to be removed onto the deleted list */
        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
            unsigned flags = qatomic_fetch_and(&node->flags,
                    ~(FDMON_IO_URING_PENDING |
                      FDMON_IO_URING_ADD |
                      FDMON_IO_URING_REMOVE));

            if (flags & FDMON_IO_URING_REMOVE) {
                QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
            }

            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
        }

        ctx->fdmon_ops = &fdmon_poll_ops;
    }
}