qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

blkio.c (30594B)


/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */
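
/*
 * Example usage (illustrative only; the device paths below are
 * placeholders, not requirements of this driver):
 *
 *   --blockdev io_uring,node-name=drive0,filename=test.img
 *   --blockdev nvme-io_uring,node-name=drive0,path=/dev/ng0n1,cache.direct=on
 *   --blockdev virtio-blk-vhost-vdpa,node-name=drive0,path=/dev/vhost-vdpa-0,cache.direct=on
 *   --blockdev virtio-blk-vhost-user,node-name=drive0,path=/tmp/vhost-user-blk.sock,cache.direct=on
 *
 * cache.direct=on is mandatory for every driver except io_uring (see the
 * per-driver open functions below, which reject cache.direct=off).
 */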

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;
    /* Must I/O buffers be registered as blkio_mem_regions before use? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}
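
/*
 * Note that resizing replaces the entire pool mapping, which is why it is
 * only legal while no bounce buffers are allocated (see the assertion
 * above): an in-flight request would otherwise lose its buffer.
 */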

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
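    /*
     * First-fit example (hypothetical addresses): with a pool spanning
     * [0x0, 0x30000) and buffers allocated at [0x0, 0x10000) and
     * [0x20000, 0x28000), a 0x8000-byte request fits into the hole at
     * [0x10000, 0x20000) and is inserted before the second list entry,
     * keeping the list sorted by address.
     */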
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;
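
/*
 * Each request passes a BlkioCoData pointer to libblkio as its user_data;
 * the completion handlers below fill in ->ret and wake ->coroutine.
 */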

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}
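
/*
 * A successful poll consumes one completion directly from the queue into
 * s->poll_completion without touching the completion fd;
 * blkio_completion_fd_read() checks that field first so the result is not
 * lost when the fd handler runs next.
 */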

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context,
                       s->completion_fd,
                       false,
                       blkio_completion_fd_read,
                       NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready,
                       bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs),
                       s->completion_fd,
                       false, NULL, NULL, NULL, NULL, NULL);
}

/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
static void blkio_submit_io(BlockDriverState *bs)
{
    if (qatomic_read(&bs->io_plugged) == 0) {
        BDRVBlkioState *s = bs->opaque;

        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}
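
/*
 * While requests are plugged (bs->io_plugged != 0), blkio_submit_io() is a
 * no-op; blkio_io_unplug() below then submits everything enqueued in the
 * meantime as a single batch.
 */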

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}
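
/*
 * blkio_co_pdiscard() above shows the pattern shared by all the request
 * functions: enqueue and submit under blkio_lock, yield until
 * blkio_completion_fd_read() wakes the coroutine, then return the
 * completion status from cod.ret.
 */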

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static void blkio_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}

typedef enum {
    BMRR_OK,    /* a valid memory region was produced */
    BMRR_SKIP,  /* no fd is available for this buffer; silently skip it */
    BMRR_FAIL,  /* fatal error, errp has been set */
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so the RAMBlock
         * lives at least until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * If the driver may pin memory, disable RAM discard so that features
     * like virtio-mem cannot discard pages that libblkio has pinned for
     * I/O.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t blkio_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return ret; /* propagate the negative errno, not its absolute value */
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_getlength(bs);
    if (current_length < 0) {
        error_setg(errp, "Failed to query the current device length");
        return current_length;
    }

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int blkio_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_getlength          = blkio_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_get_info           = blkio_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_io_unplug          = blkio_io_unplug, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }
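
/*
 * For reference: BLKIO_DRIVER(DRIVER_IO_URING, .bdrv_needs_filename = true)
 * expands to a BlockDriver whose .format_name and .protocol_name are both
 * "io_uring", plus the common callbacks above. One BlockDriver is
 * instantiated below for each libblkio driver.
 */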

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);