qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

io.c (114380B)


      1 /*
      2  * Block layer I/O functions
      3  *
      4  * Copyright (c) 2003 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 
     25 #include "qemu/osdep.h"
     26 #include "trace.h"
     27 #include "sysemu/block-backend.h"
     28 #include "block/aio-wait.h"
     29 #include "block/blockjob.h"
     30 #include "block/blockjob_int.h"
     31 #include "block/block_int.h"
     32 #include "block/coroutines.h"
     33 #include "block/write-threshold.h"
     34 #include "qemu/cutils.h"
     35 #include "qemu/memalign.h"
     36 #include "qapi/error.h"
     37 #include "qemu/error-report.h"
     38 #include "qemu/main-loop.h"
     39 #include "sysemu/replay.h"
     40 
     41 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
     42 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
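/* For reference: with the usual BDRV_SECTOR_BITS == 9 (512-byte sectors) this
 * works out to 32768 * 512 bytes, i.e. a 16 MiB cap on the bounce buffer. */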
     43 
     44 static void bdrv_parent_cb_resize(BlockDriverState *bs);
     45 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     46     int64_t offset, int64_t bytes, BdrvRequestFlags flags);
     47 
     48 static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
     49                                       bool ignore_bds_parents)
     50 {
     51     BdrvChild *c, *next;
     52 
     53     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
     54         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
     55             continue;
     56         }
     57         bdrv_parent_drained_begin_single(c, false);
     58     }
     59 }
     60 
     61 static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
     62                                                    int *drained_end_counter)
     63 {
     64     assert(c->parent_quiesce_counter > 0);
     65     c->parent_quiesce_counter--;
     66     if (c->klass->drained_end) {
     67         c->klass->drained_end(c, drained_end_counter);
     68     }
     69 }
     70 
     71 void bdrv_parent_drained_end_single(BdrvChild *c)
     72 {
     73     int drained_end_counter = 0;
     74     AioContext *ctx = bdrv_child_get_parent_aio_context(c);
     75     IO_OR_GS_CODE();
     76     bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
     77     AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0);
     78 }
     79 
     80 static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
     81                                     bool ignore_bds_parents,
     82                                     int *drained_end_counter)
     83 {
     84     BdrvChild *c;
     85 
     86     QLIST_FOREACH(c, &bs->parents, next_parent) {
     87         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
     88             continue;
     89         }
     90         bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
     91     }
     92 }
     93 
     94 static bool bdrv_parent_drained_poll_single(BdrvChild *c)
     95 {
     96     if (c->klass->drained_poll) {
     97         return c->klass->drained_poll(c);
     98     }
     99     return false;
    100 }
    101 
    102 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
    103                                      bool ignore_bds_parents)
    104 {
    105     BdrvChild *c, *next;
    106     bool busy = false;
    107 
    108     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
    109         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
    110             continue;
    111         }
    112         busy |= bdrv_parent_drained_poll_single(c);
    113     }
    114 
    115     return busy;
    116 }
    117 
    118 void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
    119 {
    120     AioContext *ctx = bdrv_child_get_parent_aio_context(c);
    121     IO_OR_GS_CODE();
    122     c->parent_quiesce_counter++;
    123     if (c->klass->drained_begin) {
    124         c->klass->drained_begin(c);
    125     }
    126     if (poll) {
    127         AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c));
    128     }
    129 }
    130 
    131 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
    132 {
    133     dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
    134                                   src->pdiscard_alignment);
    135     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    136     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    137     dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
    138                                         src->max_hw_transfer);
    139     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
    140                                  src->opt_mem_alignment);
    141     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
    142                                  src->min_mem_alignment);
    143     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    144     dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
    145 }
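/*
 * For illustration: merging a child whose max_transfer is 1 MiB into a parent
 * whose max_transfer is still 0 ("no limit") yields 1 MiB, because
 * MIN_NON_ZERO() treats 0 as "unset".  Alignment limits go the other way and
 * keep the stricter (larger) value via MAX().
 */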
    146 
    147 typedef struct BdrvRefreshLimitsState {
    148     BlockDriverState *bs;
    149     BlockLimits old_bl;
    150 } BdrvRefreshLimitsState;
    151 
    152 static void bdrv_refresh_limits_abort(void *opaque)
    153 {
    154     BdrvRefreshLimitsState *s = opaque;
    155 
    156     s->bs->bl = s->old_bl;
    157 }
    158 
    159 static TransactionActionDrv bdrv_refresh_limits_drv = {
    160     .abort = bdrv_refresh_limits_abort,
    161     .clean = g_free,
    162 };
    163 
    164 /* @tran is allowed to be NULL, in this case no rollback is possible. */
    165 void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
    166 {
    167     ERRP_GUARD();
    168     BlockDriver *drv = bs->drv;
    169     BdrvChild *c;
    170     bool have_limits;
    171 
    172     GLOBAL_STATE_CODE();
    173 
    174     if (tran) {
    175         BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
    176         *s = (BdrvRefreshLimitsState) {
    177             .bs = bs,
    178             .old_bl = bs->bl,
    179         };
    180         tran_add(tran, &bdrv_refresh_limits_drv, s);
    181     }
    182 
    183     memset(&bs->bl, 0, sizeof(bs->bl));
    184 
    185     if (!drv) {
    186         return;
    187     }
    188 
    189     /* Default alignment based on whether driver has byte interface */
    190     bs->bl.request_alignment = (drv->bdrv_co_preadv ||
    191                                 drv->bdrv_aio_preadv ||
    192                                 drv->bdrv_co_preadv_part) ? 1 : 512;
    193 
    194     /* Take some limits from the children as a default */
    195     have_limits = false;
    196     QLIST_FOREACH(c, &bs->children, next) {
    197         if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
    198         {
    199             bdrv_merge_limits(&bs->bl, &c->bs->bl);
    200             have_limits = true;
    201         }
    202     }
    203 
    204     if (!have_limits) {
    205         bs->bl.min_mem_alignment = 512;
    206         bs->bl.opt_mem_alignment = qemu_real_host_page_size();
    207 
    208         /* Safe default since most protocols use readv()/writev()/etc */
    209         bs->bl.max_iov = IOV_MAX;
    210     }
    211 
    212     /* Then let the driver override it */
    213     if (drv->bdrv_refresh_limits) {
    214         drv->bdrv_refresh_limits(bs, errp);
    215         if (*errp) {
    216             return;
    217         }
    218     }
    219 
    220     if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
    221         error_setg(errp, "Driver requires too large request alignment");
    222     }
    223 }
    224 
    225 /**
    226  * The copy-on-read flag is actually a reference count so multiple users may
    227  * use the feature without worrying about clobbering its previous state.
    228  * Copy-on-read stays enabled until all users have called to disable it.
    229  */
    230 void bdrv_enable_copy_on_read(BlockDriverState *bs)
    231 {
    232     IO_CODE();
    233     qatomic_inc(&bs->copy_on_read);
    234 }
    235 
    236 void bdrv_disable_copy_on_read(BlockDriverState *bs)
    237 {
    238     int old = qatomic_fetch_dec(&bs->copy_on_read);
    239     IO_CODE();
    240     assert(old >= 1);
    241 }
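/*
 * For illustration, the reference-count semantics described above mean that
 * nested users simply pair their calls:
 *
 *     bdrv_enable_copy_on_read(bs);     <- user A
 *     bdrv_enable_copy_on_read(bs);     <- user B
 *     bdrv_disable_copy_on_read(bs);    <- user A done, COR stays enabled
 *     bdrv_disable_copy_on_read(bs);    <- user B done, COR now disabled
 */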
    242 
    243 typedef struct {
    244     Coroutine *co;
    245     BlockDriverState *bs;
    246     bool done;
    247     bool begin;
    248     bool recursive;
    249     bool poll;
    250     BdrvChild *parent;
    251     bool ignore_bds_parents;
    252     int *drained_end_counter;
    253 } BdrvCoDrainData;
    254 
    255 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
    256 {
    257     BdrvCoDrainData *data = opaque;
    258     BlockDriverState *bs = data->bs;
    259 
    260     if (data->begin) {
    261         bs->drv->bdrv_co_drain_begin(bs);
    262     } else {
    263         bs->drv->bdrv_co_drain_end(bs);
    264     }
    265 
    266     /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    267     qatomic_mb_set(&data->done, true);
    268     if (!data->begin) {
    269         qatomic_dec(data->drained_end_counter);
    270     }
    271     bdrv_dec_in_flight(bs);
    272 
    273     g_free(data);
    274 }
    275 
    276 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
    277 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
    278                               int *drained_end_counter)
    279 {
    280     BdrvCoDrainData *data;
    281 
    282     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
    283             (!begin && !bs->drv->bdrv_co_drain_end)) {
    284         return;
    285     }
    286 
    287     data = g_new(BdrvCoDrainData, 1);
    288     *data = (BdrvCoDrainData) {
    289         .bs = bs,
    290         .done = false,
    291         .begin = begin,
    292         .drained_end_counter = drained_end_counter,
    293     };
    294 
    295     if (!begin) {
    296         qatomic_inc(drained_end_counter);
    297     }
    298 
    299     /* Make sure the driver callback completes during the polling phase for
    300      * drain_begin. */
    301     bdrv_inc_in_flight(bs);
    302     data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    303     aio_co_schedule(bdrv_get_aio_context(bs), data->co);
    304 }
    305 
    306 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
    307 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
    308                      BdrvChild *ignore_parent, bool ignore_bds_parents)
    309 {
    310     BdrvChild *child, *next;
    311     IO_OR_GS_CODE();
    312 
    313     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
    314         return true;
    315     }
    316 
    317     if (qatomic_read(&bs->in_flight)) {
    318         return true;
    319     }
    320 
    321     if (recursive) {
    322         assert(!ignore_bds_parents);
    323         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    324             if (bdrv_drain_poll(child->bs, recursive, child, false)) {
    325                 return true;
    326             }
    327         }
    328     }
    329 
    330     return false;
    331 }
    332 
    333 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
    334                                       BdrvChild *ignore_parent)
    335 {
    336     return bdrv_drain_poll(bs, recursive, ignore_parent, false);
    337 }
    338 
    339 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
    340                                   BdrvChild *parent, bool ignore_bds_parents,
    341                                   bool poll);
    342 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
    343                                 BdrvChild *parent, bool ignore_bds_parents,
    344                                 int *drained_end_counter);
    345 
    346 static void bdrv_co_drain_bh_cb(void *opaque)
    347 {
    348     BdrvCoDrainData *data = opaque;
    349     Coroutine *co = data->co;
    350     BlockDriverState *bs = data->bs;
    351 
    352     if (bs) {
    353         AioContext *ctx = bdrv_get_aio_context(bs);
    354         aio_context_acquire(ctx);
    355         bdrv_dec_in_flight(bs);
    356         if (data->begin) {
    357             assert(!data->drained_end_counter);
    358             bdrv_do_drained_begin(bs, data->recursive, data->parent,
    359                                   data->ignore_bds_parents, data->poll);
    360         } else {
    361             assert(!data->poll);
    362             bdrv_do_drained_end(bs, data->recursive, data->parent,
    363                                 data->ignore_bds_parents,
    364                                 data->drained_end_counter);
    365         }
    366         aio_context_release(ctx);
    367     } else {
    368         assert(data->begin);
    369         bdrv_drain_all_begin();
    370     }
    371 
    372     data->done = true;
    373     aio_co_wake(co);
    374 }
    375 
    376 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
    377                                                 bool begin, bool recursive,
    378                                                 BdrvChild *parent,
    379                                                 bool ignore_bds_parents,
    380                                                 bool poll,
    381                                                 int *drained_end_counter)
    382 {
    383     BdrvCoDrainData data;
    384     Coroutine *self = qemu_coroutine_self();
    385     AioContext *ctx = bdrv_get_aio_context(bs);
    386     AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
    387 
    388     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
    389      * other coroutines run if they were queued by aio_co_enter(). */
    390 
    391     assert(qemu_in_coroutine());
    392     data = (BdrvCoDrainData) {
    393         .co = self,
    394         .bs = bs,
    395         .done = false,
    396         .begin = begin,
    397         .recursive = recursive,
    398         .parent = parent,
    399         .ignore_bds_parents = ignore_bds_parents,
    400         .poll = poll,
    401         .drained_end_counter = drained_end_counter,
    402     };
    403 
    404     if (bs) {
    405         bdrv_inc_in_flight(bs);
    406     }
    407 
    408     /*
    409      * Temporarily drop the lock across yield or we would get deadlocks.
     410      * bdrv_co_drain_bh_cb() reacquires the lock as needed.
    411      *
    412      * When we yield below, the lock for the current context will be
    413      * released, so if this is actually the lock that protects bs, don't drop
    414      * it a second time.
    415      */
    416     if (ctx != co_ctx) {
    417         aio_context_release(ctx);
    418     }
    419     replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);
    420 
    421     qemu_coroutine_yield();
    422     /* If we are resumed from some other event (such as an aio completion or a
    423      * timer callback), it is a bug in the caller that should be fixed. */
    424     assert(data.done);
    425 
     426     /* Reacquire the AioContext of bs if we dropped it */
    427     if (ctx != co_ctx) {
    428         aio_context_acquire(ctx);
    429     }
    430 }
    431 
    432 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
    433                                    BdrvChild *parent, bool ignore_bds_parents)
    434 {
    435     IO_OR_GS_CODE();
    436     assert(!qemu_in_coroutine());
    437 
    438     /* Stop things in parent-to-child order */
    439     if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
    440         aio_disable_external(bdrv_get_aio_context(bs));
    441     }
    442 
    443     bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    444     bdrv_drain_invoke(bs, true, NULL);
    445 }
    446 
    447 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
    448                                   BdrvChild *parent, bool ignore_bds_parents,
    449                                   bool poll)
    450 {
    451     BdrvChild *child, *next;
    452 
    453     if (qemu_in_coroutine()) {
    454         bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
    455                                poll, NULL);
    456         return;
    457     }
    458 
    459     bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
    460 
    461     if (recursive) {
    462         assert(!ignore_bds_parents);
    463         bs->recursive_quiesce_counter++;
    464         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    465             bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
    466                                   false);
    467         }
    468     }
    469 
    470     /*
    471      * Wait for drained requests to finish.
    472      *
    473      * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
    474      * call is needed so things in this AioContext can make progress even
    475      * though we don't return to the main AioContext loop - this automatically
    476      * includes other nodes in the same AioContext and therefore all child
    477      * nodes.
    478      */
    479     if (poll) {
    480         assert(!ignore_bds_parents);
    481         BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    482     }
    483 }
    484 
    485 void bdrv_drained_begin(BlockDriverState *bs)
    486 {
    487     IO_OR_GS_CODE();
    488     bdrv_do_drained_begin(bs, false, NULL, false, true);
    489 }
    490 
    491 void bdrv_subtree_drained_begin(BlockDriverState *bs)
    492 {
    493     IO_OR_GS_CODE();
    494     bdrv_do_drained_begin(bs, true, NULL, false, true);
    495 }
    496 
    497 /**
    498  * This function does not poll, nor must any of its recursively called
    499  * functions.  The *drained_end_counter pointee will be incremented
    500  * once for every background operation scheduled, and decremented once
    501  * the operation settles.  Therefore, the pointer must remain valid
    502  * until the pointee reaches 0.  That implies that whoever sets up the
    503  * pointee has to poll until it is 0.
    504  *
    505  * We use atomic operations to access *drained_end_counter, because
    506  * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
    507  *     @bs may contain nodes in different AioContexts,
    508  * (2) bdrv_drain_all_end() uses the same counter for all nodes,
    509  *     regardless of which AioContext they are in.
    510  */
    511 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
    512                                 BdrvChild *parent, bool ignore_bds_parents,
    513                                 int *drained_end_counter)
    514 {
    515     BdrvChild *child;
    516     int old_quiesce_counter;
    517 
    518     assert(drained_end_counter != NULL);
    519 
    520     if (qemu_in_coroutine()) {
    521         bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
    522                                false, drained_end_counter);
    523         return;
    524     }
    525     assert(bs->quiesce_counter > 0);
    526 
    527     /* Re-enable things in child-to-parent order */
    528     bdrv_drain_invoke(bs, false, drained_end_counter);
    529     bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
    530                             drained_end_counter);
    531 
    532     old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    533     if (old_quiesce_counter == 1) {
    534         aio_enable_external(bdrv_get_aio_context(bs));
    535     }
    536 
    537     if (recursive) {
    538         assert(!ignore_bds_parents);
    539         bs->recursive_quiesce_counter--;
    540         QLIST_FOREACH(child, &bs->children, next) {
    541             bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
    542                                 drained_end_counter);
    543         }
    544     }
    545 }
    546 
    547 void bdrv_drained_end(BlockDriverState *bs)
    548 {
    549     int drained_end_counter = 0;
    550     IO_OR_GS_CODE();
    551     bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    552     BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    553 }
    554 
    555 void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
    556 {
    557     IO_CODE();
    558     bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
    559 }
    560 
    561 void bdrv_subtree_drained_end(BlockDriverState *bs)
    562 {
    563     int drained_end_counter = 0;
    564     IO_OR_GS_CODE();
    565     bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    566     BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    567 }
    568 
    569 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
    570 {
    571     int i;
    572     IO_OR_GS_CODE();
    573 
    574     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
    575         bdrv_do_drained_begin(child->bs, true, child, false, true);
    576     }
    577 }
    578 
    579 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
    580 {
    581     int drained_end_counter = 0;
    582     int i;
    583     IO_OR_GS_CODE();
    584 
    585     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
    586         bdrv_do_drained_end(child->bs, true, child, false,
    587                             &drained_end_counter);
    588     }
    589 
    590     BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
    591 }
    592 
    593 void bdrv_drain(BlockDriverState *bs)
    594 {
    595     IO_OR_GS_CODE();
    596     bdrv_drained_begin(bs);
    597     bdrv_drained_end(bs);
    598 }
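/*
 * For illustration, a typical drained section brackets work that must not
 * race with I/O between the begin/end pair:
 *
 *     bdrv_drained_begin(bs);     quiesce parents, wait for in-flight requests
 *     ... reconfigure the node ...
 *     bdrv_drained_end(bs);       resume I/O
 *
 * bdrv_drain() above is simply this pattern with an empty body.
 */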
    599 
    600 static void bdrv_drain_assert_idle(BlockDriverState *bs)
    601 {
    602     BdrvChild *child, *next;
    603 
    604     assert(qatomic_read(&bs->in_flight) == 0);
    605     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
    606         bdrv_drain_assert_idle(child->bs);
    607     }
    608 }
    609 
    610 unsigned int bdrv_drain_all_count = 0;
    611 
    612 static bool bdrv_drain_all_poll(void)
    613 {
    614     BlockDriverState *bs = NULL;
    615     bool result = false;
    616     GLOBAL_STATE_CODE();
    617 
    618     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
    619      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    620     while ((bs = bdrv_next_all_states(bs))) {
    621         AioContext *aio_context = bdrv_get_aio_context(bs);
    622         aio_context_acquire(aio_context);
    623         result |= bdrv_drain_poll(bs, false, NULL, true);
    624         aio_context_release(aio_context);
    625     }
    626 
    627     return result;
    628 }
    629 
    630 /*
    631  * Wait for pending requests to complete across all BlockDriverStates
    632  *
    633  * This function does not flush data to disk, use bdrv_flush_all() for that
    634  * after calling this function.
    635  *
    636  * This pauses all block jobs and disables external clients. It must
    637  * be paired with bdrv_drain_all_end().
    638  *
    639  * NOTE: no new block jobs or BlockDriverStates can be created between
    640  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
    641  */
    642 void bdrv_drain_all_begin(void)
    643 {
    644     BlockDriverState *bs = NULL;
    645     GLOBAL_STATE_CODE();
    646 
    647     if (qemu_in_coroutine()) {
    648         bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
    649         return;
    650     }
    651 
    652     /*
     653      * The bdrv queue is managed by record/replay;
     654      * waiting for the in-flight I/O requests to
     655      * finish may never terminate
    656      */
    657     if (replay_events_enabled()) {
    658         return;
    659     }
    660 
    661     /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
    662      * loop AioContext, so make sure we're in the main context. */
    663     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    664     assert(bdrv_drain_all_count < INT_MAX);
    665     bdrv_drain_all_count++;
    666 
    667     /* Quiesce all nodes, without polling in-flight requests yet. The graph
    668      * cannot change during this loop. */
    669     while ((bs = bdrv_next_all_states(bs))) {
    670         AioContext *aio_context = bdrv_get_aio_context(bs);
    671 
    672         aio_context_acquire(aio_context);
    673         bdrv_do_drained_begin(bs, false, NULL, true, false);
    674         aio_context_release(aio_context);
    675     }
    676 
    677     /* Now poll the in-flight requests */
    678     AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
    679 
    680     while ((bs = bdrv_next_all_states(bs))) {
    681         bdrv_drain_assert_idle(bs);
    682     }
    683 }
    684 
    685 void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
    686 {
    687     int drained_end_counter = 0;
    688     GLOBAL_STATE_CODE();
    689 
    690     g_assert(bs->quiesce_counter > 0);
    691     g_assert(!bs->refcnt);
    692 
    693     while (bs->quiesce_counter) {
    694         bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    695     }
    696     BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
    697 }
    698 
    699 void bdrv_drain_all_end(void)
    700 {
    701     BlockDriverState *bs = NULL;
    702     int drained_end_counter = 0;
    703     GLOBAL_STATE_CODE();
    704 
    705     /*
     706      * The bdrv queue is managed by record/replay;
     707      * waiting for the in-flight I/O requests to
     708      * finish may never terminate
    709      */
    710     if (replay_events_enabled()) {
    711         return;
    712     }
    713 
    714     while ((bs = bdrv_next_all_states(bs))) {
    715         AioContext *aio_context = bdrv_get_aio_context(bs);
    716 
    717         aio_context_acquire(aio_context);
    718         bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    719         aio_context_release(aio_context);
    720     }
    721 
    722     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    723     AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);
    724 
    725     assert(bdrv_drain_all_count > 0);
    726     bdrv_drain_all_count--;
    727 }
    728 
    729 void bdrv_drain_all(void)
    730 {
    731     GLOBAL_STATE_CODE();
    732     bdrv_drain_all_begin();
    733     bdrv_drain_all_end();
    734 }
    735 
    736 /**
    737  * Remove an active request from the tracked requests list
    738  *
    739  * This function should be called when a tracked request is completing.
    740  */
    741 static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
    742 {
    743     if (req->serialising) {
    744         qatomic_dec(&req->bs->serialising_in_flight);
    745     }
    746 
    747     qemu_co_mutex_lock(&req->bs->reqs_lock);
    748     QLIST_REMOVE(req, list);
    749     qemu_co_queue_restart_all(&req->wait_queue);
    750     qemu_co_mutex_unlock(&req->bs->reqs_lock);
    751 }
    752 
    753 /**
    754  * Add an active request to the tracked requests list
    755  */
    756 static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
    757                                                BlockDriverState *bs,
    758                                                int64_t offset,
    759                                                int64_t bytes,
    760                                                enum BdrvTrackedRequestType type)
    761 {
    762     bdrv_check_request(offset, bytes, &error_abort);
    763 
    764     *req = (BdrvTrackedRequest){
    765         .bs = bs,
    766         .offset         = offset,
    767         .bytes          = bytes,
    768         .type           = type,
    769         .co             = qemu_coroutine_self(),
    770         .serialising    = false,
    771         .overlap_offset = offset,
    772         .overlap_bytes  = bytes,
    773     };
    774 
    775     qemu_co_queue_init(&req->wait_queue);
    776 
    777     qemu_co_mutex_lock(&bs->reqs_lock);
    778     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    779     qemu_co_mutex_unlock(&bs->reqs_lock);
    780 }
    781 
    782 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
    783                                      int64_t offset, int64_t bytes)
    784 {
    785     bdrv_check_request(offset, bytes, &error_abort);
    786 
    787     /*        aaaa   bbbb */
    788     if (offset >= req->overlap_offset + req->overlap_bytes) {
    789         return false;
    790     }
    791     /* bbbb   aaaa        */
    792     if (req->overlap_offset >= offset + bytes) {
    793         return false;
    794     }
    795     return true;
    796 }
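/*
 * Worked example: with req->overlap_offset == 4096 and req->overlap_bytes ==
 * 4096, a request at offset == 8192 does not overlap (8192 >= 4096 + 4096),
 * while one at offset == 8191 with bytes == 2 does.
 */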
    797 
    798 /* Called with self->bs->reqs_lock held */
    799 static coroutine_fn BdrvTrackedRequest *
    800 bdrv_find_conflicting_request(BdrvTrackedRequest *self)
    801 {
    802     BdrvTrackedRequest *req;
    803 
    804     QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
    805         if (req == self || (!req->serialising && !self->serialising)) {
    806             continue;
    807         }
    808         if (tracked_request_overlaps(req, self->overlap_offset,
    809                                      self->overlap_bytes))
    810         {
    811             /*
    812              * Hitting this means there was a reentrant request, for
    813              * example, a block driver issuing nested requests.  This must
    814              * never happen since it means deadlock.
    815              */
    816             assert(qemu_coroutine_self() != req->co);
    817 
    818             /*
    819              * If the request is already (indirectly) waiting for us, or
    820              * will wait for us as soon as it wakes up, then just go on
    821              * (instead of producing a deadlock in the former case).
    822              */
    823             if (!req->waiting_for) {
    824                 return req;
    825             }
    826         }
    827     }
    828 
    829     return NULL;
    830 }
    831 
    832 /* Called with self->bs->reqs_lock held */
    833 static void coroutine_fn
    834 bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
    835 {
    836     BdrvTrackedRequest *req;
    837 
    838     while ((req = bdrv_find_conflicting_request(self))) {
    839         self->waiting_for = req;
    840         qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
    841         self->waiting_for = NULL;
    842     }
    843 }
    844 
    845 /* Called with req->bs->reqs_lock held */
    846 static void tracked_request_set_serialising(BdrvTrackedRequest *req,
    847                                             uint64_t align)
    848 {
    849     int64_t overlap_offset = req->offset & ~(align - 1);
    850     int64_t overlap_bytes =
    851         ROUND_UP(req->offset + req->bytes, align) - overlap_offset;
    852 
    853     bdrv_check_request(req->offset, req->bytes, &error_abort);
    854 
    855     if (!req->serialising) {
    856         qatomic_inc(&req->bs->serialising_in_flight);
    857         req->serialising = true;
    858     }
    859 
    860     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    861     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    862 }
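/*
 * Worked example: for req->offset == 5000, req->bytes == 100 and align ==
 * 4096, the serialising window widens to overlap_offset == 4096 and
 * overlap_bytes == 4096, i.e. the whole aligned range [4096, 8192).
 */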
    863 
    864 /**
    865  * Return the tracked request on @bs for the current coroutine, or
    866  * NULL if there is none.
    867  */
    868 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
    869 {
    870     BdrvTrackedRequest *req;
    871     Coroutine *self = qemu_coroutine_self();
    872     IO_CODE();
    873 
    874     QLIST_FOREACH(req, &bs->tracked_requests, list) {
    875         if (req->co == self) {
    876             return req;
    877         }
    878     }
    879 
    880     return NULL;
    881 }
    882 
    883 /**
    884  * Round a region to cluster boundaries
    885  */
    886 void bdrv_round_to_clusters(BlockDriverState *bs,
    887                             int64_t offset, int64_t bytes,
    888                             int64_t *cluster_offset,
    889                             int64_t *cluster_bytes)
    890 {
    891     BlockDriverInfo bdi;
    892     IO_CODE();
    893     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
    894         *cluster_offset = offset;
    895         *cluster_bytes = bytes;
    896     } else {
    897         int64_t c = bdi.cluster_size;
    898         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
    899         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    900     }
    901 }
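/*
 * Worked example: with a 64 KiB cluster size, offset == 65000 and bytes ==
 * 2000 round out to *cluster_offset == 0 and *cluster_bytes == 131072, i.e.
 * the two clusters touched by the request.
 */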
    902 
    903 static int bdrv_get_cluster_size(BlockDriverState *bs)
    904 {
    905     BlockDriverInfo bdi;
    906     int ret;
    907 
    908     ret = bdrv_get_info(bs, &bdi);
    909     if (ret < 0 || bdi.cluster_size == 0) {
    910         return bs->bl.request_alignment;
    911     } else {
    912         return bdi.cluster_size;
    913     }
    914 }
    915 
    916 void bdrv_inc_in_flight(BlockDriverState *bs)
    917 {
    918     IO_CODE();
    919     qatomic_inc(&bs->in_flight);
    920 }
    921 
    922 void bdrv_wakeup(BlockDriverState *bs)
    923 {
    924     IO_CODE();
    925     aio_wait_kick();
    926 }
    927 
    928 void bdrv_dec_in_flight(BlockDriverState *bs)
    929 {
    930     IO_CODE();
    931     qatomic_dec(&bs->in_flight);
    932     bdrv_wakeup(bs);
    933 }
    934 
    935 static void coroutine_fn
    936 bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
    937 {
    938     BlockDriverState *bs = self->bs;
    939 
    940     if (!qatomic_read(&bs->serialising_in_flight)) {
    941         return;
    942     }
    943 
    944     qemu_co_mutex_lock(&bs->reqs_lock);
    945     bdrv_wait_serialising_requests_locked(self);
    946     qemu_co_mutex_unlock(&bs->reqs_lock);
    947 }
    948 
    949 void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
    950                                                 uint64_t align)
    951 {
    952     IO_CODE();
    953 
    954     qemu_co_mutex_lock(&req->bs->reqs_lock);
    955 
    956     tracked_request_set_serialising(req, align);
    957     bdrv_wait_serialising_requests_locked(req);
    958 
    959     qemu_co_mutex_unlock(&req->bs->reqs_lock);
    960 }
    961 
    962 int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
    963                             QEMUIOVector *qiov, size_t qiov_offset,
    964                             Error **errp)
    965 {
    966     /*
    967      * Check generic offset/bytes correctness
    968      */
    969 
    970     if (offset < 0) {
    971         error_setg(errp, "offset is negative: %" PRIi64, offset);
    972         return -EIO;
    973     }
    974 
    975     if (bytes < 0) {
    976         error_setg(errp, "bytes is negative: %" PRIi64, bytes);
    977         return -EIO;
    978     }
    979 
    980     if (bytes > BDRV_MAX_LENGTH) {
    981         error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
    982                    bytes, BDRV_MAX_LENGTH);
    983         return -EIO;
    984     }
    985 
    986     if (offset > BDRV_MAX_LENGTH) {
    987         error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
    988                    offset, BDRV_MAX_LENGTH);
    989         return -EIO;
    990     }
    991 
    992     if (offset > BDRV_MAX_LENGTH - bytes) {
    993         error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
    994                    "exceeds maximum(%" PRIi64 ")", offset, bytes,
    995                    BDRV_MAX_LENGTH);
    996         return -EIO;
    997     }
    998 
    999     if (!qiov) {
   1000         return 0;
   1001     }
   1002 
   1003     /*
   1004      * Check qiov and qiov_offset
   1005      */
   1006 
   1007     if (qiov_offset > qiov->size) {
   1008         error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
   1009                    qiov_offset, qiov->size);
   1010         return -EIO;
   1011     }
   1012 
   1013     if (bytes > qiov->size - qiov_offset) {
   1014         error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
   1015                    "vector size(%zu)", bytes, qiov_offset, qiov->size);
   1016         return -EIO;
   1017     }
   1018 
   1019     return 0;
   1020 }
   1021 
   1022 int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
   1023 {
   1024     return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
   1025 }
   1026 
   1027 static int bdrv_check_request32(int64_t offset, int64_t bytes,
   1028                                 QEMUIOVector *qiov, size_t qiov_offset)
   1029 {
   1030     int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
   1031     if (ret < 0) {
   1032         return ret;
   1033     }
   1034 
   1035     if (bytes > BDRV_REQUEST_MAX_BYTES) {
   1036         return -EIO;
   1037     }
   1038 
   1039     return 0;
   1040 }
   1041 
   1042 /*
   1043  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
   1044  * The operation is sped up by checking the block status and only writing
   1045  * zeroes to the device if they currently do not return zeroes. Optional
   1046  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
   1047  * BDRV_REQ_FUA).
   1048  *
   1049  * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
   1050  */
   1051 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
   1052 {
   1053     int ret;
   1054     int64_t target_size, bytes, offset = 0;
   1055     BlockDriverState *bs = child->bs;
   1056     IO_CODE();
   1057 
   1058     target_size = bdrv_getlength(bs);
   1059     if (target_size < 0) {
   1060         return target_size;
   1061     }
   1062 
   1063     for (;;) {
   1064         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
   1065         if (bytes <= 0) {
   1066             return 0;
   1067         }
   1068         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
   1069         if (ret < 0) {
   1070             return ret;
   1071         }
   1072         if (ret & BDRV_BLOCK_ZERO) {
   1073             offset += bytes;
   1074             continue;
   1075         }
   1076         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
   1077         if (ret < 0) {
   1078             return ret;
   1079         }
   1080         offset += bytes;
   1081     }
   1082 }
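/*
 * Illustrative use (a sketch, not taken from a real caller): a user that
 * wants the device to read back as zeroes, preferably by unmapping, could do
 *
 *     ret = bdrv_make_zero(child, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         handle the error
 *     }
 *
 * The flag is forwarded to bdrv_pwrite_zeroes() as described above.
 */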
   1083 
   1084 /*
   1085  * Writes to the file and ensures that no writes are reordered across this
   1086  * request (acts as a barrier)
   1087  *
   1088  * Returns 0 on success, -errno in error cases.
   1089  */
   1090 int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
   1091                                      int64_t bytes, const void *buf,
   1092                                      BdrvRequestFlags flags)
   1093 {
   1094     int ret;
   1095     IO_CODE();
   1096 
   1097     ret = bdrv_co_pwrite(child, offset, bytes, buf, flags);
   1098     if (ret < 0) {
   1099         return ret;
   1100     }
   1101 
   1102     ret = bdrv_co_flush(child->bs);
   1103     if (ret < 0) {
   1104         return ret;
   1105     }
   1106 
   1107     return 0;
   1108 }
   1109 
   1110 typedef struct CoroutineIOCompletion {
   1111     Coroutine *coroutine;
   1112     int ret;
   1113 } CoroutineIOCompletion;
   1114 
   1115 static void bdrv_co_io_em_complete(void *opaque, int ret)
   1116 {
   1117     CoroutineIOCompletion *co = opaque;
   1118 
   1119     co->ret = ret;
   1120     aio_co_wake(co->coroutine);
   1121 }
   1122 
   1123 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
   1124                                            int64_t offset, int64_t bytes,
   1125                                            QEMUIOVector *qiov,
   1126                                            size_t qiov_offset, int flags)
   1127 {
   1128     BlockDriver *drv = bs->drv;
   1129     int64_t sector_num;
   1130     unsigned int nb_sectors;
   1131     QEMUIOVector local_qiov;
   1132     int ret;
   1133 
   1134     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1135     assert(!(flags & ~bs->supported_read_flags));
   1136 
   1137     if (!drv) {
   1138         return -ENOMEDIUM;
   1139     }
   1140 
   1141     if (drv->bdrv_co_preadv_part) {
   1142         return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
   1143                                         flags);
   1144     }
   1145 
   1146     if (qiov_offset > 0 || bytes != qiov->size) {
   1147         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1148         qiov = &local_qiov;
   1149     }
   1150 
   1151     if (drv->bdrv_co_preadv) {
   1152         ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
   1153         goto out;
   1154     }
   1155 
   1156     if (drv->bdrv_aio_preadv) {
   1157         BlockAIOCB *acb;
   1158         CoroutineIOCompletion co = {
   1159             .coroutine = qemu_coroutine_self(),
   1160         };
   1161 
   1162         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
   1163                                    bdrv_co_io_em_complete, &co);
   1164         if (acb == NULL) {
   1165             ret = -EIO;
   1166             goto out;
   1167         } else {
   1168             qemu_coroutine_yield();
   1169             ret = co.ret;
   1170             goto out;
   1171         }
   1172     }
   1173 
   1174     sector_num = offset >> BDRV_SECTOR_BITS;
   1175     nb_sectors = bytes >> BDRV_SECTOR_BITS;
   1176 
   1177     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
   1178     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
   1179     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
   1180     assert(drv->bdrv_co_readv);
   1181 
   1182     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
   1183 
   1184 out:
   1185     if (qiov == &local_qiov) {
   1186         qemu_iovec_destroy(&local_qiov);
   1187     }
   1188 
   1189     return ret;
   1190 }
   1191 
   1192 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
   1193                                             int64_t offset, int64_t bytes,
   1194                                             QEMUIOVector *qiov,
   1195                                             size_t qiov_offset,
   1196                                             BdrvRequestFlags flags)
   1197 {
   1198     BlockDriver *drv = bs->drv;
   1199     bool emulate_fua = false;
   1200     int64_t sector_num;
   1201     unsigned int nb_sectors;
   1202     QEMUIOVector local_qiov;
   1203     int ret;
   1204 
   1205     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1206 
   1207     if (!drv) {
   1208         return -ENOMEDIUM;
   1209     }
   1210 
   1211     if ((flags & BDRV_REQ_FUA) &&
   1212         (~bs->supported_write_flags & BDRV_REQ_FUA)) {
   1213         flags &= ~BDRV_REQ_FUA;
   1214         emulate_fua = true;
   1215     }
   1216 
   1217     flags &= bs->supported_write_flags;
   1218 
   1219     if (drv->bdrv_co_pwritev_part) {
   1220         ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
   1221                                         flags);
   1222         goto emulate_flags;
   1223     }
   1224 
   1225     if (qiov_offset > 0 || bytes != qiov->size) {
   1226         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1227         qiov = &local_qiov;
   1228     }
   1229 
   1230     if (drv->bdrv_co_pwritev) {
   1231         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
   1232         goto emulate_flags;
   1233     }
   1234 
   1235     if (drv->bdrv_aio_pwritev) {
   1236         BlockAIOCB *acb;
   1237         CoroutineIOCompletion co = {
   1238             .coroutine = qemu_coroutine_self(),
   1239         };
   1240 
   1241         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
   1242                                     bdrv_co_io_em_complete, &co);
   1243         if (acb == NULL) {
   1244             ret = -EIO;
   1245         } else {
   1246             qemu_coroutine_yield();
   1247             ret = co.ret;
   1248         }
   1249         goto emulate_flags;
   1250     }
   1251 
   1252     sector_num = offset >> BDRV_SECTOR_BITS;
   1253     nb_sectors = bytes >> BDRV_SECTOR_BITS;
   1254 
   1255     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
   1256     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
   1257     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
   1258 
   1259     assert(drv->bdrv_co_writev);
   1260     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);
   1261 
   1262 emulate_flags:
   1263     if (ret == 0 && emulate_fua) {
   1264         ret = bdrv_co_flush(bs);
   1265     }
   1266 
   1267     if (qiov == &local_qiov) {
   1268         qemu_iovec_destroy(&local_qiov);
   1269     }
   1270 
   1271     return ret;
   1272 }
   1273 
   1274 static int coroutine_fn
   1275 bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
   1276                                int64_t bytes, QEMUIOVector *qiov,
   1277                                size_t qiov_offset)
   1278 {
   1279     BlockDriver *drv = bs->drv;
   1280     QEMUIOVector local_qiov;
   1281     int ret;
   1282 
   1283     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1284 
   1285     if (!drv) {
   1286         return -ENOMEDIUM;
   1287     }
   1288 
   1289     if (!block_driver_can_compress(drv)) {
   1290         return -ENOTSUP;
   1291     }
   1292 
   1293     if (drv->bdrv_co_pwritev_compressed_part) {
   1294         return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
   1295                                                     qiov, qiov_offset);
   1296     }
   1297 
   1298     if (qiov_offset == 0) {
   1299         return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
   1300     }
   1301 
   1302     qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
   1303     ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
   1304     qemu_iovec_destroy(&local_qiov);
   1305 
   1306     return ret;
   1307 }
   1308 
   1309 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
   1310         int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   1311         size_t qiov_offset, int flags)
   1312 {
   1313     BlockDriverState *bs = child->bs;
   1314 
   1315     /* Perform I/O through a temporary buffer so that users who scribble over
   1316      * their read buffer while the operation is in progress do not end up
   1317      * modifying the image file.  This is critical for zero-copy guest I/O
   1318      * where anything might happen inside guest memory.
   1319      */
   1320     void *bounce_buffer = NULL;
   1321 
   1322     BlockDriver *drv = bs->drv;
   1323     int64_t cluster_offset;
   1324     int64_t cluster_bytes;
   1325     int64_t skip_bytes;
   1326     int ret;
   1327     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
   1328                                     BDRV_REQUEST_MAX_BYTES);
   1329     int64_t progress = 0;
   1330     bool skip_write;
   1331 
   1332     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1333 
   1334     if (!drv) {
   1335         return -ENOMEDIUM;
   1336     }
   1337 
   1338     /*
   1339      * Do not write anything when the BDS is inactive.  That is not
   1340      * allowed, and it would not help.
   1341      */
   1342     skip_write = (bs->open_flags & BDRV_O_INACTIVE);
   1343 
   1344     /* FIXME We cannot require callers to have write permissions when all they
   1345      * are doing is a read request. If we did things right, write permissions
   1346      * would be obtained anyway, but internally by the copy-on-read code. As
   1347      * long as it is implemented here rather than in a separate filter driver,
   1348      * the copy-on-read code doesn't have its own BdrvChild, however, for which
   1349      * it could request permissions. Therefore we have to bypass the permission
   1350      * system for the moment. */
   1351     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
   1352 
   1353     /* Cover entire cluster so no additional backing file I/O is required when
   1354      * allocating cluster in the image file.  Note that this value may exceed
   1355      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
   1356      * is one reason we loop rather than doing it all at once.
   1357      */
   1358     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
   1359     skip_bytes = offset - cluster_offset;
   1360 
   1361     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
   1362                                    cluster_offset, cluster_bytes);
   1363 
   1364     while (cluster_bytes) {
   1365         int64_t pnum;
   1366 
   1367         if (skip_write) {
   1368             ret = 1; /* "already allocated", so nothing will be copied */
   1369             pnum = MIN(cluster_bytes, max_transfer);
   1370         } else {
   1371             ret = bdrv_is_allocated(bs, cluster_offset,
   1372                                     MIN(cluster_bytes, max_transfer), &pnum);
   1373             if (ret < 0) {
   1374                 /*
   1375                  * Safe to treat errors in querying allocation as if
   1376                  * unallocated; we'll probably fail again soon on the
   1377                  * read, but at least that will set a decent errno.
   1378                  */
   1379                 pnum = MIN(cluster_bytes, max_transfer);
   1380             }
   1381 
   1382             /* Stop at EOF if the image ends in the middle of the cluster */
   1383             if (ret == 0 && pnum == 0) {
   1384                 assert(progress >= bytes);
   1385                 break;
   1386             }
   1387 
   1388             assert(skip_bytes < pnum);
   1389         }
   1390 
   1391         if (ret <= 0) {
   1392             QEMUIOVector local_qiov;
   1393 
   1394             /* Must copy-on-read; use the bounce buffer */
   1395             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
   1396             if (!bounce_buffer) {
   1397                 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
   1398                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
   1399                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
   1400 
   1401                 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
   1402                 if (!bounce_buffer) {
   1403                     ret = -ENOMEM;
   1404                     goto err;
   1405                 }
   1406             }
   1407             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
   1408 
   1409             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
   1410                                      &local_qiov, 0, 0);
   1411             if (ret < 0) {
   1412                 goto err;
   1413             }
   1414 
   1415             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
   1416             if (drv->bdrv_co_pwrite_zeroes &&
   1417                 buffer_is_zero(bounce_buffer, pnum)) {
   1418                 /* FIXME: Should we (perhaps conditionally) be setting
   1419                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
   1420                  * that still correctly reads as zero? */
   1421                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
   1422                                                BDRV_REQ_WRITE_UNCHANGED);
   1423             } else {
   1424                 /* This does not change the data on the disk, it is not
   1425                  * necessary to flush even in cache=writethrough mode.
   1426                  */
   1427                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
   1428                                           &local_qiov, 0,
   1429                                           BDRV_REQ_WRITE_UNCHANGED);
   1430             }
   1431 
   1432             if (ret < 0) {
   1433                 /* It might be okay to ignore write errors for guest
   1434                  * requests.  If this is a deliberate copy-on-read
   1435                  * then we don't want to ignore the error.  Simply
   1436                  * report it in all cases.
   1437                  */
   1438                 goto err;
   1439             }
   1440 
   1441             if (!(flags & BDRV_REQ_PREFETCH)) {
   1442                 qemu_iovec_from_buf(qiov, qiov_offset + progress,
   1443                                     bounce_buffer + skip_bytes,
   1444                                     MIN(pnum - skip_bytes, bytes - progress));
   1445             }
   1446         } else if (!(flags & BDRV_REQ_PREFETCH)) {
   1447             /* Read directly into the destination */
   1448             ret = bdrv_driver_preadv(bs, offset + progress,
   1449                                      MIN(pnum - skip_bytes, bytes - progress),
   1450                                      qiov, qiov_offset + progress, 0);
   1451             if (ret < 0) {
   1452                 goto err;
   1453             }
   1454         }
   1455 
   1456         cluster_offset += pnum;
   1457         cluster_bytes -= pnum;
   1458         progress += pnum - skip_bytes;
   1459         skip_bytes = 0;
   1460     }
   1461     ret = 0;
   1462 
   1463 err:
   1464     qemu_vfree(bounce_buffer);
   1465     return ret;
   1466 }
   1467 
   1468 /*
   1469  * Forwards an already correctly aligned request to the BlockDriver. This
   1470  * handles copy on read, zeroing after EOF, and fragmentation of large
   1471  * reads; any other features must be implemented by the caller.
   1472  */
   1473 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
   1474     BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
   1475     int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
   1476 {
   1477     BlockDriverState *bs = child->bs;
   1478     int64_t total_bytes, max_bytes;
   1479     int ret = 0;
   1480     int64_t bytes_remaining = bytes;
   1481     int max_transfer;
   1482 
   1483     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   1484     assert(is_power_of_2(align));
   1485     assert((offset & (align - 1)) == 0);
   1486     assert((bytes & (align - 1)) == 0);
   1487     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
   1488     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
   1489                                    align);
   1490 
   1491     /*
   1492      * TODO: We would need a per-BDS .supported_read_flags and
   1493      * potential fallback support, if we ever implement any read flags
   1494      * to pass through to drivers.  For now, there aren't any
   1495      * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
   1496      */
   1497     assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
   1498                        BDRV_REQ_REGISTERED_BUF)));
   1499 
   1500     /* Handle Copy on Read and associated serialisation */
   1501     if (flags & BDRV_REQ_COPY_ON_READ) {
   1502         /* If we touch the same cluster it counts as an overlap.  This
   1503          * guarantees that allocating writes will be serialized and not race
   1504          * with each other for the same cluster.  For example, in copy-on-read
   1505          * it ensures that the CoR read and write operations are atomic and
   1506          * guest writes cannot interleave between them. */
   1507         bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
   1508     } else {
   1509         bdrv_wait_serialising_requests(req);
   1510     }
   1511 
   1512     if (flags & BDRV_REQ_COPY_ON_READ) {
   1513         int64_t pnum;
   1514 
   1515         /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
   1516         flags &= ~BDRV_REQ_COPY_ON_READ;
   1517 
   1518         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
   1519         if (ret < 0) {
   1520             goto out;
   1521         }
   1522 
   1523         if (!ret || pnum != bytes) {
   1524             ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
   1525                                            qiov, qiov_offset, flags);
   1526             goto out;
   1527         } else if (flags & BDRV_REQ_PREFETCH) {
   1528             goto out;
   1529         }
   1530     }
   1531 
   1532     /* Forward the request to the BlockDriver, possibly fragmenting it */
   1533     total_bytes = bdrv_getlength(bs);
   1534     if (total_bytes < 0) {
   1535         ret = total_bytes;
   1536         goto out;
   1537     }
   1538 
   1539     assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));
   1540 
   1541     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
   1542     if (bytes <= max_bytes && bytes <= max_transfer) {
   1543         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
   1544         goto out;
   1545     }
   1546 
   1547     while (bytes_remaining) {
   1548         int64_t num;
   1549 
   1550         if (max_bytes) {
   1551             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
   1552             assert(num);
   1553 
   1554             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
   1555                                      num, qiov,
   1556                                      qiov_offset + bytes - bytes_remaining,
   1557                                      flags);
   1558             max_bytes -= num;
   1559         } else {
   1560             num = bytes_remaining;
   1561             ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
   1562                                     0, bytes_remaining);
   1563         }
   1564         if (ret < 0) {
   1565             goto out;
   1566         }
   1567         bytes_remaining -= num;
   1568     }
   1569 
   1570 out:
   1571     return ret < 0 ? ret : 0;
   1572 }
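        /*
         * Illustrative note (not part of upstream io.c): a worked instance of
         * the fragmentation and zero-fill logic above, under assumed limits
         * align == 512 and max_transfer == 1024, with an image whose
         * total_bytes == 1536.  A read of bytes == 2048 at offset == 1024
         * gives max_bytes == ROUND_UP(1536 - 1024, 512) == 512, so the
         * single-call fast path is skipped.  The loop then issues one driver
         * read of num == 512 at offset 1024 (exhausting max_bytes) and
         * memsets the remaining 1536 bytes of the qiov to zero, so the range
         * past EOF reads as zeroes.
         */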
   1573 
   1574 /*
   1575  * Request padding
   1576  *
   1577  *  |<---- align ----->|                     |<----- align ---->|
   1578  *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
   1579  *  |          |       |                     |     |            |
   1580  * -*----------$-------*-------- ... --------*-----$------------*---
   1581  *  |          |       |                     |     |            |
   1582  *  |          offset  |                     |     end          |
   1583  *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
   1584  *  [buf   ... )                             [tail_buf          )
   1585  *
   1586  * @buf is an aligned allocation needed to store @head and @tail paddings. @head
   1587  * is placed at the beginning of @buf and @tail at the end.
   1588  *
   1589  * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
   1590  * around tail, if tail exists.
   1591  *
   1592  * @merge_reads is true for small requests, i.e. when
   1593  * @buf_len == @head + bytes + @tail. In this case it is possible that both
   1594  * head and tail exist, but @buf_len == align and @tail_buf == @buf.
   1595  */
   1596 typedef struct BdrvRequestPadding {
   1597     uint8_t *buf;
   1598     size_t buf_len;
   1599     uint8_t *tail_buf;
   1600     size_t head;
   1601     size_t tail;
   1602     bool merge_reads;
   1603     QEMUIOVector local_qiov;
   1604 } BdrvRequestPadding;
   1605 
   1606 static bool bdrv_init_padding(BlockDriverState *bs,
   1607                               int64_t offset, int64_t bytes,
   1608                               BdrvRequestPadding *pad)
   1609 {
   1610     int64_t align = bs->bl.request_alignment;
   1611     int64_t sum;
   1612 
   1613     bdrv_check_request(offset, bytes, &error_abort);
   1614     assert(align <= INT_MAX); /* documented in block/block_int.h */
   1615     assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */
   1616 
   1617     memset(pad, 0, sizeof(*pad));
   1618 
   1619     pad->head = offset & (align - 1);
   1620     pad->tail = ((offset + bytes) & (align - 1));
   1621     if (pad->tail) {
   1622         pad->tail = align - pad->tail;
   1623     }
   1624 
   1625     if (!pad->head && !pad->tail) {
   1626         return false;
   1627     }
   1628 
   1629     assert(bytes); /* Nothing good in aligning zero-length requests */
   1630 
   1631     sum = pad->head + bytes + pad->tail;
   1632     pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
   1633     pad->buf = qemu_blockalign(bs, pad->buf_len);
   1634     pad->merge_reads = sum == pad->buf_len;
   1635     if (pad->tail) {
   1636         pad->tail_buf = pad->buf + pad->buf_len - align;
   1637     }
   1638 
   1639     return true;
   1640 }
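        /*
         * Illustrative note (not part of upstream io.c): concrete values for
         * the computation above.  Assuming bs->bl.request_alignment == 512, a
         * request with offset == 1000 and bytes == 4000 yields head == 488,
         * tail == 120 and sum == 4608 > align with both paddings present, so
         * buf_len == 1024, tail_buf == buf + 512 and merge_reads == false.
         * With align == 4096, offset == 5000 and bytes == 3000 yield
         * head == 904, tail == 192 and sum == buf_len == 4096, i.e. the
         * merge_reads case where tail_buf == buf and one aligned chunk covers
         * both paddings.
         */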
   1641 
   1642 static coroutine_fn int bdrv_padding_rmw_read(BdrvChild *child,
   1643                                               BdrvTrackedRequest *req,
   1644                                               BdrvRequestPadding *pad,
   1645                                               bool zero_middle)
   1646 {
   1647     QEMUIOVector local_qiov;
   1648     BlockDriverState *bs = child->bs;
   1649     uint64_t align = bs->bl.request_alignment;
   1650     int ret;
   1651 
   1652     assert(req->serialising && pad->buf);
   1653 
   1654     if (pad->head || pad->merge_reads) {
   1655         int64_t bytes = pad->merge_reads ? pad->buf_len : align;
   1656 
   1657         qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
   1658 
   1659         if (pad->head) {
   1660             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
   1661         }
   1662         if (pad->merge_reads && pad->tail) {
   1663             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
   1664         }
   1665         ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
   1666                                   align, &local_qiov, 0, 0);
   1667         if (ret < 0) {
   1668             return ret;
   1669         }
   1670         if (pad->head) {
   1671             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
   1672         }
   1673         if (pad->merge_reads && pad->tail) {
   1674             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
   1675         }
   1676 
   1677         if (pad->merge_reads) {
   1678             goto zero_mem;
   1679         }
   1680     }
   1681 
   1682     if (pad->tail) {
   1683         qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
   1684 
   1685         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
   1686         ret = bdrv_aligned_preadv(
   1687                 child, req,
   1688                 req->overlap_offset + req->overlap_bytes - align,
   1689                 align, align, &local_qiov, 0, 0);
   1690         if (ret < 0) {
   1691             return ret;
   1692         }
   1693         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
   1694     }
   1695 
   1696 zero_mem:
   1697     if (zero_middle) {
   1698         memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
   1699     }
   1700 
   1701     return 0;
   1702 }
   1703 
   1704 static void bdrv_padding_destroy(BdrvRequestPadding *pad)
   1705 {
   1706     if (pad->buf) {
   1707         qemu_vfree(pad->buf);
   1708         qemu_iovec_destroy(&pad->local_qiov);
   1709     }
   1710     memset(pad, 0, sizeof(*pad));
   1711 }
   1712 
   1713 /*
   1714  * bdrv_pad_request
   1715  *
   1716  * Exchange request parameters with padded request if needed. Don't include RMW
   1717  * read of padding, bdrv_padding_rmw_read() should be called separately if
   1718  * needed.
   1719  *
   1720  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
   1721  *  - on function start they represent original request
   1722  *  - on failure or when padding is not needed they are unchanged
   1723  *  - on success when padding is needed they represent padded request
   1724  */
   1725 static int bdrv_pad_request(BlockDriverState *bs,
   1726                             QEMUIOVector **qiov, size_t *qiov_offset,
   1727                             int64_t *offset, int64_t *bytes,
   1728                             BdrvRequestPadding *pad, bool *padded,
   1729                             BdrvRequestFlags *flags)
   1730 {
   1731     int ret;
   1732 
   1733     bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
   1734 
   1735     if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
   1736         if (padded) {
   1737             *padded = false;
   1738         }
   1739         return 0;
   1740     }
   1741 
   1742     ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
   1743                                    *qiov, *qiov_offset, *bytes,
   1744                                    pad->buf + pad->buf_len - pad->tail,
   1745                                    pad->tail);
   1746     if (ret < 0) {
   1747         bdrv_padding_destroy(pad);
   1748         return ret;
   1749     }
   1750     *bytes += pad->head + pad->tail;
   1751     *offset -= pad->head;
   1752     *qiov = &pad->local_qiov;
   1753     *qiov_offset = 0;
   1754     if (padded) {
   1755         *padded = true;
   1756     }
   1757     if (flags) {
   1758         /* Can't use optimization hint with bounce buffer */
   1759         *flags &= ~BDRV_REQ_REGISTERED_BUF;
   1760     }
   1761 
   1762     return 0;
   1763 }
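        /*
         * Illustrative sketch (not part of upstream io.c): the typical caller
         * pattern for the padding helpers, as used by bdrv_co_preadv_part()
         * below; the write path in bdrv_co_pwritev_part() additionally
         * serialises the request and calls bdrv_padding_rmw_read() before
         * issuing the aligned write:
         *
         *     ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes,
         *                            &pad, NULL, &flags);
         *     if (ret < 0) {
         *         goto fail;
         *     }
         *     ... issue the now-aligned request with the updated parameters ...
         *     bdrv_padding_destroy(&pad);
         */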
   1764 
   1765 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
   1766     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   1767     BdrvRequestFlags flags)
   1768 {
   1769     IO_CODE();
   1770     return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
   1771 }
   1772 
   1773 int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
   1774     int64_t offset, int64_t bytes,
   1775     QEMUIOVector *qiov, size_t qiov_offset,
   1776     BdrvRequestFlags flags)
   1777 {
   1778     BlockDriverState *bs = child->bs;
   1779     BdrvTrackedRequest req;
   1780     BdrvRequestPadding pad;
   1781     int ret;
   1782     IO_CODE();
   1783 
   1784     trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
   1785 
   1786     if (!bdrv_is_inserted(bs)) {
   1787         return -ENOMEDIUM;
   1788     }
   1789 
   1790     ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
   1791     if (ret < 0) {
   1792         return ret;
   1793     }
   1794 
   1795     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
   1796         /*
   1797          * Aligning a zero-length request is nonsense. Even if a driver
   1798          * assigns special meaning to zero-length requests (like
   1799          * qcow2_co_pwritev_compressed_part), we can't pass them to the
   1800          * driver due to request_alignment.
   1801          *
   1802          * Still, there is no reason to fail an occasional unaligned zero-length read.
   1803          */
   1804         return 0;
   1805     }
   1806 
   1807     bdrv_inc_in_flight(bs);
   1808 
   1809     /* Don't do copy-on-read if we read data before write operation */
   1810     if (qatomic_read(&bs->copy_on_read)) {
   1811         flags |= BDRV_REQ_COPY_ON_READ;
   1812     }
   1813 
   1814     ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
   1815                            NULL, &flags);
   1816     if (ret < 0) {
   1817         goto fail;
   1818     }
   1819 
   1820     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
   1821     ret = bdrv_aligned_preadv(child, &req, offset, bytes,
   1822                               bs->bl.request_alignment,
   1823                               qiov, qiov_offset, flags);
   1824     tracked_request_end(&req);
   1825     bdrv_padding_destroy(&pad);
   1826 
   1827 fail:
   1828     bdrv_dec_in_flight(bs);
   1829 
   1830     return ret;
   1831 }
   1832 
   1833 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
   1834     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
   1835 {
   1836     BlockDriver *drv = bs->drv;
   1837     QEMUIOVector qiov;
   1838     void *buf = NULL;
   1839     int ret = 0;
   1840     bool need_flush = false;
   1841     int head = 0;
   1842     int tail = 0;
   1843 
   1844     int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
   1845                                             INT64_MAX);
   1846     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
   1847                         bs->bl.request_alignment);
   1848     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
   1849 
   1850     bdrv_check_request(offset, bytes, &error_abort);
   1851 
   1852     if (!drv) {
   1853         return -ENOMEDIUM;
   1854     }
   1855 
   1856     if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
   1857         return -ENOTSUP;
   1858     }
   1859 
   1860     /* By definition there is no user buffer so this flag doesn't make sense */
   1861     if (flags & BDRV_REQ_REGISTERED_BUF) {
   1862         return -EINVAL;
   1863     }
   1864 
   1865     /* Invalidate the cached block-status data range if this write overlaps */
   1866     bdrv_bsc_invalidate_range(bs, offset, bytes);
   1867 
   1868     assert(alignment % bs->bl.request_alignment == 0);
   1869     head = offset % alignment;
   1870     tail = (offset + bytes) % alignment;
   1871     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
   1872     assert(max_write_zeroes >= bs->bl.request_alignment);
   1873 
   1874     while (bytes > 0 && !ret) {
   1875         int64_t num = bytes;
   1876 
   1877         /* Align request.  Block drivers can expect the "bulk" of the request
   1878          * to be aligned, and that unaligned requests do not cross cluster
   1879          * boundaries.
   1880          */
   1881         if (head) {
   1882             /* Make a small request up to the first aligned sector. For
   1883              * convenience, limit this request to max_transfer even if
   1884              * we don't need to fall back to writes.  */
   1885             num = MIN(MIN(bytes, max_transfer), alignment - head);
   1886             head = (head + num) % alignment;
   1887             assert(num < max_write_zeroes);
   1888         } else if (tail && num > alignment) {
   1889             /* Shorten the request to the last aligned sector.  */
   1890             num -= tail;
   1891         }
   1892 
   1893         /* limit request size */
   1894         if (num > max_write_zeroes) {
   1895             num = max_write_zeroes;
   1896         }
   1897 
   1898         ret = -ENOTSUP;
   1899         /* First try the efficient write zeroes operation */
   1900         if (drv->bdrv_co_pwrite_zeroes) {
   1901             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
   1902                                              flags & bs->supported_zero_flags);
   1903             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
   1904                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
   1905                 need_flush = true;
   1906             }
   1907         } else {
   1908             assert(!bs->supported_zero_flags);
   1909         }
   1910 
   1911         if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
   1912             /* Fall back to bounce buffer if write zeroes is unsupported */
   1913             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
   1914 
   1915             if ((flags & BDRV_REQ_FUA) &&
   1916                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
   1917                 /* No need for bdrv_driver_pwritev() to do a fallback
   1918                  * flush on each chunk; use just one at the end */
   1919                 write_flags &= ~BDRV_REQ_FUA;
   1920                 need_flush = true;
   1921             }
   1922             num = MIN(num, max_transfer);
   1923             if (buf == NULL) {
   1924                 buf = qemu_try_blockalign0(bs, num);
   1925                 if (buf == NULL) {
   1926                     ret = -ENOMEM;
   1927                     goto fail;
   1928                 }
   1929             }
   1930             qemu_iovec_init_buf(&qiov, buf, num);
   1931 
   1932             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
   1933 
   1934             /* Keep the bounce buffer around if it is big enough for
   1935              * all future requests.
   1936              */
   1937             if (num < max_transfer) {
   1938                 qemu_vfree(buf);
   1939                 buf = NULL;
   1940             }
   1941         }
   1942 
   1943         offset += num;
   1944         bytes -= num;
   1945     }
   1946 
   1947 fail:
   1948     if (ret == 0 && need_flush) {
   1949         ret = bdrv_co_flush(bs);
   1950     }
   1951     qemu_vfree(buf);
   1952     return ret;
   1953 }
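        /*
         * Illustrative note (not part of upstream io.c): how the loop above
         * splits an unaligned zero request, assuming alignment == 4096 and
         * large max_write_zeroes/max_transfer limits.  For offset == 1000 and
         * bytes == 10000, head == 1000 and tail == 2808, so the first
         * iteration writes 3096 bytes up to the 4096 boundary, the second
         * writes the aligned middle chunk of 4096 bytes, and the third writes
         * the remaining 2808-byte unaligned tail.  The unaligned head and
         * tail chunks are the ones most likely to take the bounce-buffer
         * fallback if the driver cannot zero sub-alignment ranges.
         */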
   1954 
   1955 static inline int coroutine_fn
   1956 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
   1957                           BdrvTrackedRequest *req, int flags)
   1958 {
   1959     BlockDriverState *bs = child->bs;
   1960 
   1961     bdrv_check_request(offset, bytes, &error_abort);
   1962 
   1963     if (bdrv_is_read_only(bs)) {
   1964         return -EPERM;
   1965     }
   1966 
   1967     assert(!(bs->open_flags & BDRV_O_INACTIVE));
   1968     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
   1969     assert(!(flags & ~BDRV_REQ_MASK));
   1970     assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
   1971 
   1972     if (flags & BDRV_REQ_SERIALISING) {
   1973         QEMU_LOCK_GUARD(&bs->reqs_lock);
   1974 
   1975         tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
   1976 
   1977         if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
   1978             return -EBUSY;
   1979         }
   1980 
   1981         bdrv_wait_serialising_requests_locked(req);
   1982     } else {
   1983         bdrv_wait_serialising_requests(req);
   1984     }
   1985 
   1986     assert(req->overlap_offset <= offset);
   1987     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
   1988     assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
   1989            child->perm & BLK_PERM_RESIZE);
   1990 
   1991     switch (req->type) {
   1992     case BDRV_TRACKED_WRITE:
   1993     case BDRV_TRACKED_DISCARD:
   1994         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
   1995             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
   1996         } else {
   1997             assert(child->perm & BLK_PERM_WRITE);
   1998         }
   1999         bdrv_write_threshold_check_write(bs, offset, bytes);
   2000         return 0;
   2001     case BDRV_TRACKED_TRUNCATE:
   2002         assert(child->perm & BLK_PERM_RESIZE);
   2003         return 0;
   2004     default:
   2005         abort();
   2006     }
   2007 }
   2008 
   2009 static inline void coroutine_fn
   2010 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
   2011                          BdrvTrackedRequest *req, int ret)
   2012 {
   2013     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
   2014     BlockDriverState *bs = child->bs;
   2015 
   2016     bdrv_check_request(offset, bytes, &error_abort);
   2017 
   2018     qatomic_inc(&bs->write_gen);
   2019 
   2020     /*
   2021      * Discard cannot extend the image, but in error handling cases, such as
   2022      * when reverting a qcow2 cluster allocation, the discarded range can
   2023      * extend past the end of the image file, so we cannot assert about
   2024      * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
   2025      * discard request beyond EOF cannot expand the image anyway.
   2026      */
   2027     if (ret == 0 &&
   2028         (req->type == BDRV_TRACKED_TRUNCATE ||
   2029          end_sector > bs->total_sectors) &&
   2030         req->type != BDRV_TRACKED_DISCARD) {
   2031         bs->total_sectors = end_sector;
   2032         bdrv_parent_cb_resize(bs);
   2033         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
   2034     }
   2035     if (req->bytes) {
   2036         switch (req->type) {
   2037         case BDRV_TRACKED_WRITE:
   2038             stat64_max(&bs->wr_highest_offset, offset + bytes);
   2039             /* fall through, to set dirty bits */
   2040         case BDRV_TRACKED_DISCARD:
   2041             bdrv_set_dirty(bs, offset, bytes);
   2042             break;
   2043         default:
   2044             break;
   2045         }
   2046     }
   2047 }
   2048 
   2049 /*
   2050  * Forwards an already correctly aligned write request to the BlockDriver,
   2051  * after possibly fragmenting it.
   2052  */
   2053 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
   2054     BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
   2055     int64_t align, QEMUIOVector *qiov, size_t qiov_offset,
   2056     BdrvRequestFlags flags)
   2057 {
   2058     BlockDriverState *bs = child->bs;
   2059     BlockDriver *drv = bs->drv;
   2060     int ret;
   2061 
   2062     int64_t bytes_remaining = bytes;
   2063     int max_transfer;
   2064 
   2065     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
   2066 
   2067     if (!drv) {
   2068         return -ENOMEDIUM;
   2069     }
   2070 
   2071     if (bdrv_has_readonly_bitmaps(bs)) {
   2072         return -EPERM;
   2073     }
   2074 
   2075     assert(is_power_of_2(align));
   2076     assert((offset & (align - 1)) == 0);
   2077     assert((bytes & (align - 1)) == 0);
   2078     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
   2079                                    align);
   2080 
   2081     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
   2082 
   2083     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
   2084         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
   2085         qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
   2086         flags |= BDRV_REQ_ZERO_WRITE;
   2087         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
   2088             flags |= BDRV_REQ_MAY_UNMAP;
   2089         }
   2090     }
   2091 
   2092     if (ret < 0) {
   2093         /* Do nothing, write notifier decided to fail this request */
   2094     } else if (flags & BDRV_REQ_ZERO_WRITE) {
   2095         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
   2096         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
   2097     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
   2098         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
   2099                                              qiov, qiov_offset);
   2100     } else if (bytes <= max_transfer) {
   2101         bdrv_debug_event(bs, BLKDBG_PWRITEV);
   2102         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
   2103     } else {
   2104         bdrv_debug_event(bs, BLKDBG_PWRITEV);
   2105         while (bytes_remaining) {
   2106             int num = MIN(bytes_remaining, max_transfer);
   2107             int local_flags = flags;
   2108 
   2109             assert(num);
   2110             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
   2111                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
   2112                 /* If FUA is going to be emulated by flush, we only
   2113                  * need to flush on the last iteration */
   2114                 local_flags &= ~BDRV_REQ_FUA;
   2115             }
   2116 
   2117             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
   2118                                       num, qiov,
   2119                                       qiov_offset + bytes - bytes_remaining,
   2120                                       local_flags);
   2121             if (ret < 0) {
   2122                 break;
   2123             }
   2124             bytes_remaining -= num;
   2125         }
   2126     }
   2127     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
   2128 
   2129     if (ret >= 0) {
   2130         ret = 0;
   2131     }
   2132     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
   2133 
   2134     return ret;
   2135 }
   2136 
   2137 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
   2138                                                 int64_t offset,
   2139                                                 int64_t bytes,
   2140                                                 BdrvRequestFlags flags,
   2141                                                 BdrvTrackedRequest *req)
   2142 {
   2143     BlockDriverState *bs = child->bs;
   2144     QEMUIOVector local_qiov;
   2145     uint64_t align = bs->bl.request_alignment;
   2146     int ret = 0;
   2147     bool padding;
   2148     BdrvRequestPadding pad;
   2149 
   2150     /* This flag doesn't make sense for padding or zero writes */
   2151     flags &= ~BDRV_REQ_REGISTERED_BUF;
   2152 
   2153     padding = bdrv_init_padding(bs, offset, bytes, &pad);
   2154     if (padding) {
   2155         assert(!(flags & BDRV_REQ_NO_WAIT));
   2156         bdrv_make_request_serialising(req, align);
   2157 
   2158         bdrv_padding_rmw_read(child, req, &pad, true);
   2159 
   2160         if (pad.head || pad.merge_reads) {
   2161             int64_t aligned_offset = offset & ~(align - 1);
   2162             int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
   2163 
   2164             qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
   2165             ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
   2166                                        align, &local_qiov, 0,
   2167                                        flags & ~BDRV_REQ_ZERO_WRITE);
   2168             if (ret < 0 || pad.merge_reads) {
   2169                 /* Error or all work is done */
   2170                 goto out;
   2171             }
   2172             offset += write_bytes - pad.head;
   2173             bytes -= write_bytes - pad.head;
   2174         }
   2175     }
   2176 
   2177     assert(!bytes || (offset & (align - 1)) == 0);
   2178     if (bytes >= align) {
   2179         /* Write the aligned part in the middle. */
   2180         int64_t aligned_bytes = bytes & ~(align - 1);
   2181         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
   2182                                    NULL, 0, flags);
   2183         if (ret < 0) {
   2184             goto out;
   2185         }
   2186         bytes -= aligned_bytes;
   2187         offset += aligned_bytes;
   2188     }
   2189 
   2190     assert(!bytes || (offset & (align - 1)) == 0);
   2191     if (bytes) {
   2192         assert(align == pad.tail + bytes);
   2193 
   2194         qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
   2195         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
   2196                                    &local_qiov, 0,
   2197                                    flags & ~BDRV_REQ_ZERO_WRITE);
   2198     }
   2199 
   2200 out:
   2201     bdrv_padding_destroy(&pad);
   2202 
   2203     return ret;
   2204 }
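        /*
         * Illustrative note (not part of upstream io.c): with align == 512, a
         * zero write at offset == 700 for bytes == 2000 is handled above as
         * three parts: an RMW write of the 512-byte head chunk at offset 512
         * (pad.head == 188), an aligned zero write of 1536 bytes at offset
         * 1024, and an RMW write of the 512-byte tail chunk at offset 2560
         * (pad.tail == 372, satisfying align == pad.tail + bytes).
         */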
   2205 
   2206 /*
   2207  * Handle a write request in coroutine context
   2208  */
   2209 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
   2210     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
   2211     BdrvRequestFlags flags)
   2212 {
   2213     IO_CODE();
   2214     return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
   2215 }
   2216 
   2217 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
   2218     int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
   2219     BdrvRequestFlags flags)
   2220 {
   2221     BlockDriverState *bs = child->bs;
   2222     BdrvTrackedRequest req;
   2223     uint64_t align = bs->bl.request_alignment;
   2224     BdrvRequestPadding pad;
   2225     int ret;
   2226     bool padded = false;
   2227     IO_CODE();
   2228 
   2229     trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);
   2230 
   2231     if (!bdrv_is_inserted(bs)) {
   2232         return -ENOMEDIUM;
   2233     }
   2234 
   2235     if (flags & BDRV_REQ_ZERO_WRITE) {
   2236         ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
   2237     } else {
   2238         ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
   2239     }
   2240     if (ret < 0) {
   2241         return ret;
   2242     }
   2243 
   2244     /* If the request is misaligned then we can't make it efficient */
   2245     if ((flags & BDRV_REQ_NO_FALLBACK) &&
   2246         !QEMU_IS_ALIGNED(offset | bytes, align))
   2247     {
   2248         return -ENOTSUP;
   2249     }
   2250 
   2251     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
   2252         /*
   2253          * Aligning a zero-length request is nonsense. Even if a driver
   2254          * assigns special meaning to zero-length requests (like
   2255          * qcow2_co_pwritev_compressed_part), we can't pass them to the
   2256          * driver due to request_alignment.
   2257          *
   2258          * Still, there is no reason to fail an occasional unaligned zero-length write.
   2259          */
   2260         return 0;
   2261     }
   2262 
   2263     if (!(flags & BDRV_REQ_ZERO_WRITE)) {
   2264         /*
   2265          * Pad the request for the following read-modify-write cycle.
   2266          * bdrv_co_do_zero_pwritev() does the aligning by itself, so we only
   2267          * do the alignment here if there is no ZERO flag.
   2268          */
   2269         ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
   2270                                &padded, &flags);
   2271         if (ret < 0) {
   2272             return ret;
   2273         }
   2274     }
   2275 
   2276     bdrv_inc_in_flight(bs);
   2277     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
   2278 
   2279     if (flags & BDRV_REQ_ZERO_WRITE) {
   2280         assert(!padded);
   2281         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
   2282         goto out;
   2283     }
   2284 
   2285     if (padded) {
   2286         /*
   2287          * Request was unaligned to request_alignment and therefore
   2288          * padded.  We are going to do read-modify-write, and must
   2289          * serialize the request to prevent interactions of the
   2290          * widened region with other transactions.
   2291          */
   2292         assert(!(flags & BDRV_REQ_NO_WAIT));
   2293         bdrv_make_request_serialising(&req, align);
   2294         bdrv_padding_rmw_read(child, &req, &pad, false);
   2295     }
   2296 
   2297     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
   2298                                qiov, qiov_offset, flags);
   2299 
   2300     bdrv_padding_destroy(&pad);
   2301 
   2302 out:
   2303     tracked_request_end(&req);
   2304     bdrv_dec_in_flight(bs);
   2305 
   2306     return ret;
   2307 }
   2308 
   2309 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
   2310                                        int64_t bytes, BdrvRequestFlags flags)
   2311 {
   2312     IO_CODE();
   2313     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
   2314 
   2315     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
   2316         flags &= ~BDRV_REQ_MAY_UNMAP;
   2317     }
   2318 
   2319     return bdrv_co_pwritev(child, offset, bytes, NULL,
   2320                            BDRV_REQ_ZERO_WRITE | flags);
   2321 }
   2322 
   2323 /*
   2324  * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
   2325  */
   2326 int bdrv_flush_all(void)
   2327 {
   2328     BdrvNextIterator it;
   2329     BlockDriverState *bs = NULL;
   2330     int result = 0;
   2331 
   2332     GLOBAL_STATE_CODE();
   2333 
   2334     /*
   2335      * The bdrv queue is managed by record/replay;
   2336      * creating a new flush request for stopping
   2337      * the VM may break determinism.
   2338      */
   2339     if (replay_events_enabled()) {
   2340         return result;
   2341     }
   2342 
   2343     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   2344         AioContext *aio_context = bdrv_get_aio_context(bs);
   2345         int ret;
   2346 
   2347         aio_context_acquire(aio_context);
   2348         ret = bdrv_flush(bs);
   2349         if (ret < 0 && !result) {
   2350             result = ret;
   2351         }
   2352         aio_context_release(aio_context);
   2353     }
   2354 
   2355     return result;
   2356 }
   2357 
   2358 /*
   2359  * Returns the allocation status of the specified byte range.
   2360  * Drivers not implementing the functionality are assumed to not support
   2361  * backing files, hence all their sectors are reported as allocated.
   2362  *
   2363  * If 'want_zero' is true, the caller is querying for mapping
   2364  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
   2365  * _ZERO where possible; otherwise, the result favors larger 'pnum',
   2366  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
   2367  *
   2368  * If 'offset' is beyond the end of the disk image the return value is
   2369  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
   2370  *
   2371  * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
   2372  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
   2373  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
   2374  *
   2375  * 'pnum' is set to the number of bytes (including and immediately
   2376  * following the specified offset) that are easily known to be in the
   2377  * same allocated/unallocated state.  Note that a second call starting
   2378  * at the original offset plus returned pnum may have the same status.
   2379  * The returned value is non-zero on success except at end-of-file.
   2380  *
   2381  * Returns negative errno on failure.  Otherwise, if the
   2382  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
   2383  * set to the host mapping and BDS corresponding to the guest offset.
   2384  */
   2385 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
   2386                                              bool want_zero,
   2387                                              int64_t offset, int64_t bytes,
   2388                                              int64_t *pnum, int64_t *map,
   2389                                              BlockDriverState **file)
   2390 {
   2391     int64_t total_size;
   2392     int64_t n; /* bytes */
   2393     int ret;
   2394     int64_t local_map = 0;
   2395     BlockDriverState *local_file = NULL;
   2396     int64_t aligned_offset, aligned_bytes;
   2397     uint32_t align;
   2398     bool has_filtered_child;
   2399 
   2400     assert(pnum);
   2401     *pnum = 0;
   2402     total_size = bdrv_getlength(bs);
   2403     if (total_size < 0) {
   2404         ret = total_size;
   2405         goto early_out;
   2406     }
   2407 
   2408     if (offset >= total_size) {
   2409         ret = BDRV_BLOCK_EOF;
   2410         goto early_out;
   2411     }
   2412     if (!bytes) {
   2413         ret = 0;
   2414         goto early_out;
   2415     }
   2416 
   2417     n = total_size - offset;
   2418     if (n < bytes) {
   2419         bytes = n;
   2420     }
   2421 
   2422     /* Must be non-NULL or bdrv_getlength() would have failed */
   2423     assert(bs->drv);
   2424     has_filtered_child = bdrv_filter_child(bs);
   2425     if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
   2426         *pnum = bytes;
   2427         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
   2428         if (offset + bytes == total_size) {
   2429             ret |= BDRV_BLOCK_EOF;
   2430         }
   2431         if (bs->drv->protocol_name) {
   2432             ret |= BDRV_BLOCK_OFFSET_VALID;
   2433             local_map = offset;
   2434             local_file = bs;
   2435         }
   2436         goto early_out;
   2437     }
   2438 
   2439     bdrv_inc_in_flight(bs);
   2440 
   2441     /* Round out to request_alignment boundaries */
   2442     align = bs->bl.request_alignment;
   2443     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
   2444     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
   2445 
   2446     if (bs->drv->bdrv_co_block_status) {
   2447         /*
   2448          * Use the block-status cache only for protocol nodes: Format
   2449          * drivers are generally quick to inquire the status, but protocol
   2450          * drivers often need to get information from outside of qemu, so
   2451          * we do not have control over the actual implementation.  There
   2452          * have been cases where inquiring the status took an unreasonably
   2453          * long time, and we can do nothing in qemu to fix it.
   2454          * This is especially problematic for images with large data areas,
   2455          * because finding the few holes in them and giving them special
   2456          * treatment does not gain much performance.  Therefore, we try to
   2457          * cache the last-identified data region.
   2458          *
   2459          * Second, limiting ourselves to protocol nodes allows us to assume
   2460          * the block status for data regions to be DATA | OFFSET_VALID, and
   2461          * that the host offset is the same as the guest offset.
   2462          *
   2463          * Note that it is possible that external writers zero parts of
   2464          * the cached regions without the cache being invalidated, and so
   2465          * we may report zeroes as data.  This is not catastrophic,
   2466          * however, because reporting zeroes as data is fine.
   2467          */
   2468         if (QLIST_EMPTY(&bs->children) &&
   2469             bdrv_bsc_is_data(bs, aligned_offset, pnum))
   2470         {
   2471             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
   2472             local_file = bs;
   2473             local_map = aligned_offset;
   2474         } else {
   2475             ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
   2476                                                 aligned_bytes, pnum, &local_map,
   2477                                                 &local_file);
   2478 
   2479             /*
   2480              * Note that checking QLIST_EMPTY(&bs->children) is also done when
   2481              * the cache is queried above.  Technically, we do not need to check
   2482              * it here; the worst that can happen is that we fill the cache for
   2483              * non-protocol nodes, and then it is never used.  However, filling
   2484              * the cache requires an RCU update, so double check here to avoid
   2485              * such an update if possible.
   2486              *
   2487              * Check want_zero, because we only want to update the cache when we
   2488              * have accurate information about what is zero and what is data.
   2489              */
   2490             if (want_zero &&
   2491                 ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
   2492                 QLIST_EMPTY(&bs->children))
   2493             {
   2494                 /*
   2495                  * When a protocol driver reports BLOCK_OFFSET_VALID, the
   2496                  * returned local_map value must be the same as the offset we
   2497              * have passed (aligned_offset), and local_file must be the node
   2498                  * itself.
   2499                  * Assert this, because we follow this rule when reading from
   2500                  * the cache (see the `local_file = bs` and
   2501                  * `local_map = aligned_offset` assignments above), and the
   2502                  * result the cache delivers must be the same as the driver
   2503                  * would deliver.
   2504                  */
   2505                 assert(local_file == bs);
   2506                 assert(local_map == aligned_offset);
   2507                 bdrv_bsc_fill(bs, aligned_offset, *pnum);
   2508             }
   2509         }
   2510     } else {
   2511         /* Default code for filters */
   2512 
   2513         local_file = bdrv_filter_bs(bs);
   2514         assert(local_file);
   2515 
   2516         *pnum = aligned_bytes;
   2517         local_map = aligned_offset;
   2518         ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
   2519     }
   2520     if (ret < 0) {
   2521         *pnum = 0;
   2522         goto out;
   2523     }
   2524 
   2525     /*
   2526      * The driver's result must be a non-zero multiple of request_alignment.
   2527      * Clamp pnum and adjust map to original request.
   2528      */
   2529     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
   2530            align > offset - aligned_offset);
   2531     if (ret & BDRV_BLOCK_RECURSE) {
   2532         assert(ret & BDRV_BLOCK_DATA);
   2533         assert(ret & BDRV_BLOCK_OFFSET_VALID);
   2534         assert(!(ret & BDRV_BLOCK_ZERO));
   2535     }
   2536 
   2537     *pnum -= offset - aligned_offset;
   2538     if (*pnum > bytes) {
   2539         *pnum = bytes;
   2540     }
   2541     if (ret & BDRV_BLOCK_OFFSET_VALID) {
   2542         local_map += offset - aligned_offset;
   2543     }
   2544 
   2545     if (ret & BDRV_BLOCK_RAW) {
   2546         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
   2547         ret = bdrv_co_block_status(local_file, want_zero, local_map,
   2548                                    *pnum, pnum, &local_map, &local_file);
   2549         goto out;
   2550     }
   2551 
   2552     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
   2553         ret |= BDRV_BLOCK_ALLOCATED;
   2554     } else if (bs->drv->supports_backing) {
   2555         BlockDriverState *cow_bs = bdrv_cow_bs(bs);
   2556 
   2557         if (!cow_bs) {
   2558             ret |= BDRV_BLOCK_ZERO;
   2559         } else if (want_zero) {
   2560             int64_t size2 = bdrv_getlength(cow_bs);
   2561 
   2562             if (size2 >= 0 && offset >= size2) {
   2563                 ret |= BDRV_BLOCK_ZERO;
   2564             }
   2565         }
   2566     }
   2567 
   2568     if (want_zero && ret & BDRV_BLOCK_RECURSE &&
   2569         local_file && local_file != bs &&
   2570         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
   2571         (ret & BDRV_BLOCK_OFFSET_VALID)) {
   2572         int64_t file_pnum;
   2573         int ret2;
   2574 
   2575         ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
   2576                                     *pnum, &file_pnum, NULL, NULL);
   2577         if (ret2 >= 0) {
   2578             /* Ignore errors.  This is just providing extra information;
   2579              * it is useful but not necessary.
   2580              */
   2581             if (ret2 & BDRV_BLOCK_EOF &&
   2582                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
   2583                 /*
   2584                  * It is valid for the format block driver to read
   2585                  * beyond the end of the underlying file's current
   2586                  * size; such areas read as zero.
   2587                  */
   2588                 ret |= BDRV_BLOCK_ZERO;
   2589             } else {
   2590                 /* Limit request to the range reported by the protocol driver */
   2591                 *pnum = file_pnum;
   2592                 ret |= (ret2 & BDRV_BLOCK_ZERO);
   2593             }
   2594         }
   2595     }
   2596 
   2597 out:
   2598     bdrv_dec_in_flight(bs);
   2599     if (ret >= 0 && offset + *pnum == total_size) {
   2600         ret |= BDRV_BLOCK_EOF;
   2601     }
   2602 early_out:
   2603     if (file) {
   2604         *file = local_file;
   2605     }
   2606     if (map) {
   2607         *map = local_map;
   2608     }
   2609     return ret;
   2610 }
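        /*
         * Illustrative note (not part of upstream io.c): a typical caller
         * combines the returned flags with *pnum.  For example, to decide
         * whether a range is known to read as zeroes one checks
         * (ret & BDRV_BLOCK_ZERO) and verifies that *pnum covers the whole
         * requested range, which is essentially what bdrv_co_is_zero_fast()
         * further below does on top of the _above variant.
         */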
   2611 
   2612 int coroutine_fn
   2613 bdrv_co_common_block_status_above(BlockDriverState *bs,
   2614                                   BlockDriverState *base,
   2615                                   bool include_base,
   2616                                   bool want_zero,
   2617                                   int64_t offset,
   2618                                   int64_t bytes,
   2619                                   int64_t *pnum,
   2620                                   int64_t *map,
   2621                                   BlockDriverState **file,
   2622                                   int *depth)
   2623 {
   2624     int ret;
   2625     BlockDriverState *p;
   2626     int64_t eof = 0;
   2627     int dummy;
   2628     IO_CODE();
   2629 
   2630     assert(!include_base || base); /* Can't include NULL base */
   2631 
   2632     if (!depth) {
   2633         depth = &dummy;
   2634     }
   2635     *depth = 0;
   2636 
   2637     if (!include_base && bs == base) {
   2638         *pnum = bytes;
   2639         return 0;
   2640     }
   2641 
   2642     ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
   2643     ++*depth;
   2644     if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
   2645         return ret;
   2646     }
   2647 
   2648     if (ret & BDRV_BLOCK_EOF) {
   2649         eof = offset + *pnum;
   2650     }
   2651 
   2652     assert(*pnum <= bytes);
   2653     bytes = *pnum;
   2654 
   2655     for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
   2656          p = bdrv_filter_or_cow_bs(p))
   2657     {
   2658         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
   2659                                    file);
   2660         ++*depth;
   2661         if (ret < 0) {
   2662             return ret;
   2663         }
   2664         if (*pnum == 0) {
   2665             /*
   2666              * The top layer deferred to this layer, and because this layer is
   2667              * short, any zeroes that we synthesize beyond EOF behave as if they
   2668              * were allocated at this layer.
   2669              *
   2670              * We don't include BDRV_BLOCK_EOF in ret, as the upper layer may
   2671              * be larger. We'll add BDRV_BLOCK_EOF if needed at the end of the
   2672              * function; see below.
   2673              */
   2674             assert(ret & BDRV_BLOCK_EOF);
   2675             *pnum = bytes;
   2676             if (file) {
   2677                 *file = p;
   2678             }
   2679             ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
   2680             break;
   2681         }
   2682         if (ret & BDRV_BLOCK_ALLOCATED) {
   2683             /*
   2684              * We've found the node and the status, so we must break.
   2685              *
   2686              * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may
   2687              * be larger. We'll add BDRV_BLOCK_EOF if needed at the end of the
   2688              * function; see below.
   2689              */
   2690             ret &= ~BDRV_BLOCK_EOF;
   2691             break;
   2692         }
   2693 
   2694         if (p == base) {
   2695             assert(include_base);
   2696             break;
   2697         }
   2698 
   2699         /*
   2700          * OK, the [offset, offset + *pnum) region is unallocated on this
   2701          * layer, so let's continue diving.
   2702          */
   2703         assert(*pnum <= bytes);
   2704         bytes = *pnum;
   2705     }
   2706 
   2707     if (offset + *pnum == eof) {
   2708         ret |= BDRV_BLOCK_EOF;
   2709     }
   2710 
   2711     return ret;
   2712 }
   2713 
   2714 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
   2715                             int64_t offset, int64_t bytes, int64_t *pnum,
   2716                             int64_t *map, BlockDriverState **file)
   2717 {
   2718     IO_CODE();
   2719     return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
   2720                                           pnum, map, file, NULL);
   2721 }
   2722 
   2723 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2724                       int64_t *pnum, int64_t *map, BlockDriverState **file)
   2725 {
   2726     IO_CODE();
   2727     return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
   2728                                    offset, bytes, pnum, map, file);
   2729 }
   2730 
   2731 /*
   2732  * Check @bs (and its backing chain) to see if the range defined
   2733  * by @offset and @bytes is known to read as zeroes.
   2734  * Return 1 if that is the case, 0 otherwise, and -errno on error.
   2735  * This test is meant to be fast rather than accurate, so returning 0
   2736  * does not guarantee non-zero data.
   2737  */
   2738 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
   2739                                       int64_t bytes)
   2740 {
   2741     int ret;
   2742     int64_t pnum = bytes;
   2743     IO_CODE();
   2744 
   2745     if (!bytes) {
   2746         return 1;
   2747     }
   2748 
   2749     ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset,
   2750                                             bytes, &pnum, NULL, NULL, NULL);
   2751 
   2752     if (ret < 0) {
   2753         return ret;
   2754     }
   2755 
   2756     return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
   2757 }
   2758 
   2759 int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2760                       int64_t *pnum)
   2761 {
   2762     int ret;
   2763     int64_t dummy;
   2764     IO_CODE();
   2765 
   2766     ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
   2767                                          bytes, pnum ? pnum : &dummy, NULL,
   2768                                          NULL, NULL);
   2769     if (ret < 0) {
   2770         return ret;
   2771     }
   2772     return !!(ret & BDRV_BLOCK_ALLOCATED);
   2773 }
   2774 
   2775 /*
   2776  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
   2777  *
   2778  * Return a positive depth if (a prefix of) the given range is allocated
   2779  * in any image between BASE and TOP (BASE is only included if include_base
   2780  * is set).  Depth 1 is TOP, 2 is the first backing layer, and so forth.
   2781  * BASE can be NULL to check if the given offset is allocated in any
   2782  * image of the chain.  Return 0 otherwise, or negative errno on
   2783  * failure.
   2784  *
   2785  * 'pnum' is set to the number of bytes (including and immediately
   2786  * following the specified offset) that are known to be in the same
   2787  * allocated/unallocated state.  Note that a subsequent call starting
   2788  * at 'offset + *pnum' may return the same allocation status (in other
   2789  * words, the result is not necessarily the maximum possible range);
   2790  * but 'pnum' will only be 0 when end of file is reached.
   2791  */
   2792 int bdrv_is_allocated_above(BlockDriverState *top,
   2793                             BlockDriverState *base,
   2794                             bool include_base, int64_t offset,
   2795                             int64_t bytes, int64_t *pnum)
   2796 {
   2797     int depth;
   2798     int ret = bdrv_common_block_status_above(top, base, include_base, false,
   2799                                              offset, bytes, pnum, NULL, NULL,
   2800                                              &depth);
   2801     IO_CODE();
   2802     if (ret < 0) {
   2803         return ret;
   2804     }
   2805 
   2806     if (ret & BDRV_BLOCK_ALLOCATED) {
   2807         return depth;
   2808     }
   2809     return 0;
   2810 }
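        /*
         * Illustrative note (not part of upstream io.c): for a chain
         * base <- mid <- top, querying a range that is allocated only in
         * "mid" with include_base == false returns depth == 2 (top would be
         * depth 1), while a range allocated nowhere above "base" returns 0.
         */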
   2811 
   2812 int coroutine_fn
   2813 bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
   2814 {
   2815     BlockDriver *drv = bs->drv;
   2816     BlockDriverState *child_bs = bdrv_primary_bs(bs);
   2817     int ret;
   2818     IO_CODE();
   2819 
   2820     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
   2821     if (ret < 0) {
   2822         return ret;
   2823     }
   2824 
   2825     if (!drv) {
   2826         return -ENOMEDIUM;
   2827     }
   2828 
   2829     bdrv_inc_in_flight(bs);
   2830 
   2831     if (drv->bdrv_load_vmstate) {
   2832         ret = drv->bdrv_load_vmstate(bs, qiov, pos);
   2833     } else if (child_bs) {
   2834         ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
   2835     } else {
   2836         ret = -ENOTSUP;
   2837     }
   2838 
   2839     bdrv_dec_in_flight(bs);
   2840 
   2841     return ret;
   2842 }
   2843 
   2844 int coroutine_fn
   2845 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
   2846 {
   2847     BlockDriver *drv = bs->drv;
   2848     BlockDriverState *child_bs = bdrv_primary_bs(bs);
   2849     int ret;
   2850     IO_CODE();
   2851 
   2852     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
   2853     if (ret < 0) {
   2854         return ret;
   2855     }
   2856 
   2857     if (!drv) {
   2858         return -ENOMEDIUM;
   2859     }
   2860 
   2861     bdrv_inc_in_flight(bs);
   2862 
   2863     if (drv->bdrv_save_vmstate) {
   2864         ret = drv->bdrv_save_vmstate(bs, qiov, pos);
   2865     } else if (child_bs) {
   2866         ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
   2867     } else {
   2868         ret = -ENOTSUP;
   2869     }
   2870 
   2871     bdrv_dec_in_flight(bs);
   2872 
   2873     return ret;
   2874 }
   2875 
   2876 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
   2877                       int64_t pos, int size)
   2878 {
   2879     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
   2880     int ret = bdrv_writev_vmstate(bs, &qiov, pos);
   2881     IO_CODE();
   2882 
   2883     return ret < 0 ? ret : size;
   2884 }
   2885 
   2886 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
   2887                       int64_t pos, int size)
   2888 {
   2889     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
   2890     int ret = bdrv_readv_vmstate(bs, &qiov, pos);
   2891     IO_CODE();
   2892 
   2893     return ret < 0 ? ret : size;
   2894 }
   2895 
   2896 /**************************************************************/
   2897 /* async I/Os */
   2898 
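         /*
          * Synchronously cancel @acb: request asynchronous cancellation and
          * then poll the relevant AioContext until the request has completed
          * and released its extra references.
          */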
   2899 void bdrv_aio_cancel(BlockAIOCB *acb)
   2900 {
   2901     IO_CODE();
   2902     qemu_aio_ref(acb);
   2903     bdrv_aio_cancel_async(acb);
   2904     while (acb->refcnt > 1) {
   2905         if (acb->aiocb_info->get_aio_context) {
   2906             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
   2907         } else if (acb->bs) {
   2908             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
   2909              * assert that we're not using an I/O thread.  Thread-safe
   2910              * code should use bdrv_aio_cancel_async exclusively.
   2911              */
   2912             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
   2913             aio_poll(bdrv_get_aio_context(acb->bs), true);
   2914         } else {
   2915             abort();
   2916         }
   2917     }
   2918     qemu_aio_unref(acb);
   2919 }
   2920 
   2921 /* Async version of aio cancel. The caller is not blocked if the acb implements
    2922  * cancel_async; otherwise we do nothing and let the request complete normally.
   2923  * In either case the completion callback must be called. */
   2924 void bdrv_aio_cancel_async(BlockAIOCB *acb)
   2925 {
   2926     IO_CODE();
   2927     if (acb->aiocb_info->cancel_async) {
   2928         acb->aiocb_info->cancel_async(acb);
   2929     }
   2930 }
   2931 
   2932 /**************************************************************/
   2933 /* Coroutine block device emulation */
   2934 
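         /*
          * Flush @bs to stable storage.  If the driver provides a single
          * bdrv_co_flush callback, it is responsible for all layers;
          * otherwise data is flushed to the OS, then (unless BDRV_O_NO_FLUSH
          * is set or nothing has been written since the last successful
          * flush) to the disk, and finally the writable children are flushed.
          */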
   2935 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
   2936 {
   2937     BdrvChild *primary_child = bdrv_primary_child(bs);
   2938     BdrvChild *child;
   2939     int current_gen;
   2940     int ret = 0;
   2941     IO_CODE();
   2942 
   2943     bdrv_inc_in_flight(bs);
   2944 
   2945     if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
   2946         bdrv_is_sg(bs)) {
   2947         goto early_exit;
   2948     }
   2949 
   2950     qemu_co_mutex_lock(&bs->reqs_lock);
   2951     current_gen = qatomic_read(&bs->write_gen);
   2952 
   2953     /* Wait until any previous flushes are completed */
   2954     while (bs->active_flush_req) {
   2955         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
   2956     }
   2957 
   2958     /* Flushes reach this point in nondecreasing current_gen order.  */
   2959     bs->active_flush_req = true;
   2960     qemu_co_mutex_unlock(&bs->reqs_lock);
   2961 
   2962     /* Write back all layers by calling one driver function */
   2963     if (bs->drv->bdrv_co_flush) {
   2964         ret = bs->drv->bdrv_co_flush(bs);
   2965         goto out;
   2966     }
   2967 
   2968     /* Write back cached data to the OS even with cache=unsafe */
   2969     BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
   2970     if (bs->drv->bdrv_co_flush_to_os) {
   2971         ret = bs->drv->bdrv_co_flush_to_os(bs);
   2972         if (ret < 0) {
   2973             goto out;
   2974         }
   2975     }
   2976 
   2977     /* But don't actually force it to the disk with cache=unsafe */
   2978     if (bs->open_flags & BDRV_O_NO_FLUSH) {
   2979         goto flush_children;
   2980     }
   2981 
   2982     /* Check if we really need to flush anything */
   2983     if (bs->flushed_gen == current_gen) {
   2984         goto flush_children;
   2985     }
   2986 
   2987     BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
   2988     if (!bs->drv) {
   2989         /* bs->drv->bdrv_co_flush() might have ejected the BDS
   2990          * (even in case of apparent success) */
   2991         ret = -ENOMEDIUM;
   2992         goto out;
   2993     }
   2994     if (bs->drv->bdrv_co_flush_to_disk) {
   2995         ret = bs->drv->bdrv_co_flush_to_disk(bs);
   2996     } else if (bs->drv->bdrv_aio_flush) {
   2997         BlockAIOCB *acb;
   2998         CoroutineIOCompletion co = {
   2999             .coroutine = qemu_coroutine_self(),
   3000         };
   3001 
   3002         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
   3003         if (acb == NULL) {
   3004             ret = -EIO;
   3005         } else {
   3006             qemu_coroutine_yield();
   3007             ret = co.ret;
   3008         }
   3009     } else {
   3010         /*
   3011          * Some block drivers always operate in either writethrough or unsafe
    3012          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
   3013          * know how the server works (because the behaviour is hardcoded or
   3014          * depends on server-side configuration), so we can't ensure that
   3015          * everything is safe on disk. Returning an error doesn't work because
   3016          * that would break guests even if the server operates in writethrough
   3017          * mode.
   3018          *
    3019          * Let's hope the user knows what they're doing.
   3020          */
   3021         ret = 0;
   3022     }
   3023 
   3024     if (ret < 0) {
   3025         goto out;
   3026     }
   3027 
   3028     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
   3029      * in the case of cache=unsafe, so there are no useless flushes.
   3030      */
   3031 flush_children:
   3032     ret = 0;
   3033     QLIST_FOREACH(child, &bs->children, next) {
   3034         if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
   3035             int this_child_ret = bdrv_co_flush(child->bs);
   3036             if (!ret) {
   3037                 ret = this_child_ret;
   3038             }
   3039         }
   3040     }
   3041 
   3042 out:
   3043     /* Notify any pending flushes that we have completed */
   3044     if (ret == 0) {
   3045         bs->flushed_gen = current_gen;
   3046     }
   3047 
   3048     qemu_co_mutex_lock(&bs->reqs_lock);
   3049     bs->active_flush_req = false;
   3050     /* Return value is ignored - it's ok if wait queue is empty */
   3051     qemu_co_queue_next(&bs->flush_queue);
   3052     qemu_co_mutex_unlock(&bs->reqs_lock);
   3053 
   3054 early_exit:
   3055     bdrv_dec_in_flight(bs);
   3056     return ret;
   3057 }
   3058 
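         /*
          * Discard @bytes at @offset below @child.  The request is a no-op if
          * the node was opened without BDRV_O_UNMAP or the driver implements
          * neither bdrv_co_pdiscard nor bdrv_aio_pdiscard; otherwise it is
          * fragmented according to the driver's discard alignment and length
          * limits.
          */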
   3059 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
   3060                                   int64_t bytes)
   3061 {
   3062     BdrvTrackedRequest req;
   3063     int ret;
   3064     int64_t max_pdiscard;
   3065     int head, tail, align;
   3066     BlockDriverState *bs = child->bs;
   3067     IO_CODE();
   3068 
   3069     if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
   3070         return -ENOMEDIUM;
   3071     }
   3072 
   3073     if (bdrv_has_readonly_bitmaps(bs)) {
   3074         return -EPERM;
   3075     }
   3076 
   3077     ret = bdrv_check_request(offset, bytes, NULL);
   3078     if (ret < 0) {
   3079         return ret;
   3080     }
   3081 
   3082     /* Do nothing if disabled.  */
   3083     if (!(bs->open_flags & BDRV_O_UNMAP)) {
   3084         return 0;
   3085     }
   3086 
   3087     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
   3088         return 0;
   3089     }
   3090 
   3091     /* Invalidate the cached block-status data range if this discard overlaps */
   3092     bdrv_bsc_invalidate_range(bs, offset, bytes);
   3093 
   3094     /* Discard is advisory, but some devices track and coalesce
   3095      * unaligned requests, so we must pass everything down rather than
   3096      * round here.  Still, most devices will just silently ignore
   3097      * unaligned requests (by returning -ENOTSUP), so we must fragment
   3098      * the request accordingly.  */
   3099     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
   3100     assert(align % bs->bl.request_alignment == 0);
   3101     head = offset % align;
   3102     tail = (offset + bytes) % align;
   3103 
   3104     bdrv_inc_in_flight(bs);
   3105     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
   3106 
   3107     ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
   3108     if (ret < 0) {
   3109         goto out;
   3110     }
   3111 
   3112     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX),
   3113                                    align);
   3114     assert(max_pdiscard >= bs->bl.request_alignment);
   3115 
   3116     while (bytes > 0) {
   3117         int64_t num = bytes;
   3118 
   3119         if (head) {
   3120             /* Make small requests to get to alignment boundaries. */
   3121             num = MIN(bytes, align - head);
   3122             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
   3123                 num %= bs->bl.request_alignment;
   3124             }
   3125             head = (head + num) % align;
   3126             assert(num < max_pdiscard);
   3127         } else if (tail) {
   3128             if (num > align) {
   3129                 /* Shorten the request to the last aligned cluster.  */
   3130                 num -= tail;
   3131             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
   3132                        tail > bs->bl.request_alignment) {
   3133                 tail %= bs->bl.request_alignment;
   3134                 num -= tail;
   3135             }
   3136         }
   3137         /* limit request size */
   3138         if (num > max_pdiscard) {
   3139             num = max_pdiscard;
   3140         }
   3141 
   3142         if (!bs->drv) {
   3143             ret = -ENOMEDIUM;
   3144             goto out;
   3145         }
   3146         if (bs->drv->bdrv_co_pdiscard) {
   3147             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
   3148         } else {
   3149             BlockAIOCB *acb;
   3150             CoroutineIOCompletion co = {
   3151                 .coroutine = qemu_coroutine_self(),
   3152             };
   3153 
   3154             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
   3155                                              bdrv_co_io_em_complete, &co);
   3156             if (acb == NULL) {
   3157                 ret = -EIO;
   3158                 goto out;
   3159             } else {
   3160                 qemu_coroutine_yield();
   3161                 ret = co.ret;
   3162             }
   3163         }
   3164         if (ret && ret != -ENOTSUP) {
   3165             goto out;
   3166         }
   3167 
   3168         offset += num;
   3169         bytes -= num;
   3170     }
   3171     ret = 0;
   3172 out:
   3173     bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
   3174     tracked_request_end(&req);
   3175     bdrv_dec_in_flight(bs);
   3176     return ret;
   3177 }
   3178 
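         /*
          * Forward an ioctl request to the driver's bdrv_co_ioctl or
          * bdrv_aio_ioctl callback.  Returns -ENOTSUP if the driver
          * implements neither.
          */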
   3179 int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
   3180 {
   3181     BlockDriver *drv = bs->drv;
   3182     CoroutineIOCompletion co = {
   3183         .coroutine = qemu_coroutine_self(),
   3184     };
   3185     BlockAIOCB *acb;
   3186     IO_CODE();
   3187 
   3188     bdrv_inc_in_flight(bs);
   3189     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
   3190         co.ret = -ENOTSUP;
   3191         goto out;
   3192     }
   3193 
   3194     if (drv->bdrv_co_ioctl) {
   3195         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
   3196     } else {
   3197         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
   3198         if (!acb) {
   3199             co.ret = -ENOTSUP;
   3200             goto out;
   3201         }
   3202         qemu_coroutine_yield();
   3203     }
   3204 out:
   3205     bdrv_dec_in_flight(bs);
   3206     return co.ret;
   3207 }
   3208 
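         /*
          * Buffer allocation helpers: allocate memory suitably aligned for
          * I/O on @bs.  The plain variants abort on allocation failure, the
          * _try_ variants return NULL instead, and the 0-suffixed variants
          * zero the buffer.
          */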
   3209 void *qemu_blockalign(BlockDriverState *bs, size_t size)
   3210 {
   3211     IO_CODE();
   3212     return qemu_memalign(bdrv_opt_mem_align(bs), size);
   3213 }
   3214 
   3215 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
   3216 {
   3217     IO_CODE();
   3218     return memset(qemu_blockalign(bs, size), 0, size);
   3219 }
   3220 
   3221 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
   3222 {
   3223     size_t align = bdrv_opt_mem_align(bs);
   3224     IO_CODE();
   3225 
   3226     /* Ensure that NULL is never returned on success */
   3227     assert(align > 0);
   3228     if (size == 0) {
   3229         size = align;
   3230     }
   3231 
   3232     return qemu_try_memalign(align, size);
   3233 }
   3234 
   3235 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
   3236 {
   3237     void *mem = qemu_try_blockalign(bs, size);
   3238     IO_CODE();
   3239 
   3240     if (mem) {
   3241         memset(mem, 0, size);
   3242     }
   3243 
   3244     return mem;
   3245 }
   3246 
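         /*
          * I/O plugging lets drivers batch queued requests.  Plug/unplug
          * calls are propagated to all children and nest, so the driver
          * callbacks run only on the outermost plug and the matching final
          * unplug.
          */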
   3247 void bdrv_io_plug(BlockDriverState *bs)
   3248 {
   3249     BdrvChild *child;
   3250     IO_CODE();
   3251 
   3252     QLIST_FOREACH(child, &bs->children, next) {
   3253         bdrv_io_plug(child->bs);
   3254     }
   3255 
   3256     if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
   3257         BlockDriver *drv = bs->drv;
   3258         if (drv && drv->bdrv_io_plug) {
   3259             drv->bdrv_io_plug(bs);
   3260         }
   3261     }
   3262 }
   3263 
   3264 void bdrv_io_unplug(BlockDriverState *bs)
   3265 {
   3266     BdrvChild *child;
   3267     IO_CODE();
   3268 
   3269     assert(bs->io_plugged);
   3270     if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
   3271         BlockDriver *drv = bs->drv;
   3272         if (drv && drv->bdrv_io_unplug) {
   3273             drv->bdrv_io_unplug(bs);
   3274         }
   3275     }
   3276 
   3277     QLIST_FOREACH(child, &bs->children, next) {
   3278         bdrv_io_unplug(child->bs);
   3279     }
   3280 }
   3281 
   3282 /* Helper that undoes bdrv_register_buf() when it fails partway through */
   3283 static void bdrv_register_buf_rollback(BlockDriverState *bs,
   3284                                        void *host,
   3285                                        size_t size,
   3286                                        BdrvChild *final_child)
   3287 {
   3288     BdrvChild *child;
   3289 
   3290     QLIST_FOREACH(child, &bs->children, next) {
   3291         if (child == final_child) {
   3292             break;
   3293         }
   3294 
   3295         bdrv_unregister_buf(child->bs, host, size);
   3296     }
   3297 
   3298     if (bs->drv && bs->drv->bdrv_unregister_buf) {
   3299         bs->drv->bdrv_unregister_buf(bs, host, size);
   3300     }
   3301 }
   3302 
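         /*
          * Register the memory region [@host, @host + @size) with this node's
          * driver and, recursively, with all child nodes so that drivers
          * which support it can set the buffer up for I/O in advance.  On
          * partial failure, everything registered so far is rolled back and
          * false is returned.
          */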
   3303 bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size,
   3304                        Error **errp)
   3305 {
   3306     BdrvChild *child;
   3307 
   3308     GLOBAL_STATE_CODE();
   3309     if (bs->drv && bs->drv->bdrv_register_buf) {
   3310         if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) {
   3311             return false;
   3312         }
   3313     }
   3314     QLIST_FOREACH(child, &bs->children, next) {
   3315         if (!bdrv_register_buf(child->bs, host, size, errp)) {
   3316             bdrv_register_buf_rollback(bs, host, size, child);
   3317             return false;
   3318         }
   3319     }
   3320     return true;
   3321 }
   3322 
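         /* Undo bdrv_register_buf() for @host/@size on @bs and all of its children */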
   3323 void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size)
   3324 {
   3325     BdrvChild *child;
   3326 
   3327     GLOBAL_STATE_CODE();
   3328     if (bs->drv && bs->drv->bdrv_unregister_buf) {
   3329         bs->drv->bdrv_unregister_buf(bs, host, size);
   3330     }
   3331     QLIST_FOREACH(child, &bs->children, next) {
   3332         bdrv_unregister_buf(child->bs, host, size);
   3333     }
   3334 }
   3335 
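         /*
          * Common implementation of bdrv_co_copy_range_from() and
          * bdrv_co_copy_range_to(): @recurse_src selects whether the request
          * is tracked on the source or on the destination node before the
          * matching driver callback is invoked.  Zero writes are handled by
          * bdrv_co_pwrite_zeroes() instead.
          */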
   3336 static int coroutine_fn bdrv_co_copy_range_internal(
   3337         BdrvChild *src, int64_t src_offset, BdrvChild *dst,
   3338         int64_t dst_offset, int64_t bytes,
   3339         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
   3340         bool recurse_src)
   3341 {
   3342     BdrvTrackedRequest req;
   3343     int ret;
   3344 
   3345     /* TODO We can support BDRV_REQ_NO_FALLBACK here */
   3346     assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
   3347     assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
   3348     assert(!(read_flags & BDRV_REQ_NO_WAIT));
   3349     assert(!(write_flags & BDRV_REQ_NO_WAIT));
   3350 
   3351     if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) {
   3352         return -ENOMEDIUM;
   3353     }
   3354     ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
   3355     if (ret) {
   3356         return ret;
   3357     }
   3358     if (write_flags & BDRV_REQ_ZERO_WRITE) {
   3359         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
   3360     }
   3361 
   3362     if (!src || !src->bs || !bdrv_is_inserted(src->bs)) {
   3363         return -ENOMEDIUM;
   3364     }
   3365     ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
   3366     if (ret) {
   3367         return ret;
   3368     }
   3369 
   3370     if (!src->bs->drv->bdrv_co_copy_range_from
   3371         || !dst->bs->drv->bdrv_co_copy_range_to
   3372         || src->bs->encrypted || dst->bs->encrypted) {
   3373         return -ENOTSUP;
   3374     }
   3375 
   3376     if (recurse_src) {
   3377         bdrv_inc_in_flight(src->bs);
   3378         tracked_request_begin(&req, src->bs, src_offset, bytes,
   3379                               BDRV_TRACKED_READ);
   3380 
   3381         /* BDRV_REQ_SERIALISING is only for write operation */
   3382         assert(!(read_flags & BDRV_REQ_SERIALISING));
   3383         bdrv_wait_serialising_requests(&req);
   3384 
   3385         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
   3386                                                     src, src_offset,
   3387                                                     dst, dst_offset,
   3388                                                     bytes,
   3389                                                     read_flags, write_flags);
   3390 
   3391         tracked_request_end(&req);
   3392         bdrv_dec_in_flight(src->bs);
   3393     } else {
   3394         bdrv_inc_in_flight(dst->bs);
   3395         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
   3396                               BDRV_TRACKED_WRITE);
   3397         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
   3398                                         write_flags);
   3399         if (!ret) {
   3400             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
   3401                                                       src, src_offset,
   3402                                                       dst, dst_offset,
   3403                                                       bytes,
   3404                                                       read_flags, write_flags);
   3405         }
   3406         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
   3407         tracked_request_end(&req);
   3408         bdrv_dec_in_flight(dst->bs);
   3409     }
   3410 
   3411     return ret;
   3412 }
   3413 
   3414 /* Copy range from @src to @dst.
   3415  *
   3416  * See the comment of bdrv_co_copy_range for the parameter and return value
   3417  * semantics. */
   3418 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
   3419                                          BdrvChild *dst, int64_t dst_offset,
   3420                                          int64_t bytes,
   3421                                          BdrvRequestFlags read_flags,
   3422                                          BdrvRequestFlags write_flags)
   3423 {
   3424     IO_CODE();
   3425     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
   3426                                   read_flags, write_flags);
   3427     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
   3428                                        bytes, read_flags, write_flags, true);
   3429 }
   3430 
   3431 /* Copy range from @src to @dst.
   3432  *
   3433  * See the comment of bdrv_co_copy_range for the parameter and return value
   3434  * semantics. */
   3435 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
   3436                                        BdrvChild *dst, int64_t dst_offset,
   3437                                        int64_t bytes,
   3438                                        BdrvRequestFlags read_flags,
   3439                                        BdrvRequestFlags write_flags)
   3440 {
   3441     IO_CODE();
   3442     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
   3443                                 read_flags, write_flags);
   3444     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
   3445                                        bytes, read_flags, write_flags, false);
   3446 }
   3447 
   3448 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
   3449                                     BdrvChild *dst, int64_t dst_offset,
   3450                                     int64_t bytes, BdrvRequestFlags read_flags,
   3451                                     BdrvRequestFlags write_flags)
   3452 {
   3453     IO_CODE();
   3454     return bdrv_co_copy_range_from(src, src_offset,
   3455                                    dst, dst_offset,
   3456                                    bytes, read_flags, write_flags);
   3457 }
   3458 
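         /* Call the resize callback of all parents that implement it */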
   3459 static void bdrv_parent_cb_resize(BlockDriverState *bs)
   3460 {
   3461     BdrvChild *c;
   3462     QLIST_FOREACH(c, &bs->parents, next_parent) {
   3463         if (c->klass->resize) {
   3464             c->klass->resize(c);
   3465         }
   3466     }
   3467 }
   3468 
   3469 /**
   3470  * Truncate file to 'offset' bytes (needed only for file protocols)
   3471  *
   3472  * If 'exact' is true, the file must be resized to exactly the given
   3473  * 'offset'.  Otherwise, it is sufficient for the node to be at least
   3474  * 'offset' bytes in length.
   3475  */
   3476 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
   3477                                   PreallocMode prealloc, BdrvRequestFlags flags,
   3478                                   Error **errp)
   3479 {
   3480     BlockDriverState *bs = child->bs;
   3481     BdrvChild *filtered, *backing;
   3482     BlockDriver *drv = bs->drv;
   3483     BdrvTrackedRequest req;
   3484     int64_t old_size, new_bytes;
   3485     int ret;
   3486     IO_CODE();
   3487 
   3488     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
   3489     if (!drv) {
   3490         error_setg(errp, "No medium inserted");
   3491         return -ENOMEDIUM;
   3492     }
   3493     if (offset < 0) {
   3494         error_setg(errp, "Image size cannot be negative");
   3495         return -EINVAL;
   3496     }
   3497 
   3498     ret = bdrv_check_request(offset, 0, errp);
   3499     if (ret < 0) {
   3500         return ret;
   3501     }
   3502 
   3503     old_size = bdrv_getlength(bs);
   3504     if (old_size < 0) {
   3505         error_setg_errno(errp, -old_size, "Failed to get old image size");
   3506         return old_size;
   3507     }
   3508 
   3509     if (bdrv_is_read_only(bs)) {
   3510         error_setg(errp, "Image is read-only");
   3511         return -EACCES;
   3512     }
   3513 
   3514     if (offset > old_size) {
   3515         new_bytes = offset - old_size;
   3516     } else {
   3517         new_bytes = 0;
   3518     }
   3519 
   3520     bdrv_inc_in_flight(bs);
   3521     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
   3522                           BDRV_TRACKED_TRUNCATE);
   3523 
   3524     /* If we are growing the image and potentially using preallocation for the
   3525      * new area, we need to make sure that no write requests are made to it
   3526      * concurrently or they might be overwritten by preallocation. */
   3527     if (new_bytes) {
   3528         bdrv_make_request_serialising(&req, 1);
   3529     }
   3530     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
   3531                                     0);
   3532     if (ret < 0) {
   3533         error_setg_errno(errp, -ret,
   3534                          "Failed to prepare request for truncation");
   3535         goto out;
   3536     }
   3537 
   3538     filtered = bdrv_filter_child(bs);
   3539     backing = bdrv_cow_child(bs);
   3540 
   3541     /*
   3542      * If the image has a backing file that is large enough that it would
   3543      * provide data for the new area, we cannot leave it unallocated because
   3544      * then the backing file content would become visible. Instead, zero-fill
   3545      * the new area.
   3546      *
    3547      * Note that if the image has a backing file but was opened without it,
    3548      * keeping things consistent with that backing file is the user's
    3549      * responsibility.
   3550      */
   3551     if (new_bytes && backing) {
   3552         int64_t backing_len;
   3553 
   3554         backing_len = bdrv_getlength(backing->bs);
   3555         if (backing_len < 0) {
   3556             ret = backing_len;
   3557             error_setg_errno(errp, -ret, "Could not get backing file size");
   3558             goto out;
   3559         }
   3560 
   3561         if (backing_len > old_size) {
   3562             flags |= BDRV_REQ_ZERO_WRITE;
   3563         }
   3564     }
   3565 
   3566     if (drv->bdrv_co_truncate) {
   3567         if (flags & ~bs->supported_truncate_flags) {
   3568             error_setg(errp, "Block driver does not support requested flags");
   3569             ret = -ENOTSUP;
   3570             goto out;
   3571         }
   3572         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
   3573     } else if (filtered) {
   3574         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
   3575     } else {
   3576         error_setg(errp, "Image format driver does not support resize");
   3577         ret = -ENOTSUP;
   3578         goto out;
   3579     }
   3580     if (ret < 0) {
   3581         goto out;
   3582     }
   3583 
   3584     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
   3585     if (ret < 0) {
   3586         error_setg_errno(errp, -ret, "Could not refresh total sector count");
   3587     } else {
   3588         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
   3589     }
    3590     /* It's possible that truncation succeeded while refresh_total_sectors
    3591      * failed; the latter doesn't affect how we should finish the request.
   3592      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
   3593     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
   3594 
   3595 out:
   3596     tracked_request_end(&req);
   3597     bdrv_dec_in_flight(bs);
   3598 
   3599     return ret;
   3600 }
   3601 
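         /*
          * Ask the driver of @bs to cancel its in-flight requests, if it
          * implements bdrv_cancel_in_flight.  This is a no-op otherwise.
          */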
   3602 void bdrv_cancel_in_flight(BlockDriverState *bs)
   3603 {
   3604     GLOBAL_STATE_CODE();
   3605     if (!bs || !bs->drv) {
   3606         return;
   3607     }
   3608 
   3609     if (bs->drv->bdrv_cancel_in_flight) {
   3610         bs->drv->bdrv_cancel_in_flight(bs);
   3611     }
   3612 }
   3613 
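         /*
          * The following functions forward snapshot-access requests (read,
          * block status, discard) to the corresponding *_snapshot driver
          * callbacks, returning -ENOTSUP when the driver does not provide
          * them.
          */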
   3614 int coroutine_fn
   3615 bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
   3616                         QEMUIOVector *qiov, size_t qiov_offset)
   3617 {
   3618     BlockDriverState *bs = child->bs;
   3619     BlockDriver *drv = bs->drv;
   3620     int ret;
   3621     IO_CODE();
   3622 
   3623     if (!drv) {
   3624         return -ENOMEDIUM;
   3625     }
   3626 
   3627     if (!drv->bdrv_co_preadv_snapshot) {
   3628         return -ENOTSUP;
   3629     }
   3630 
   3631     bdrv_inc_in_flight(bs);
   3632     ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
   3633     bdrv_dec_in_flight(bs);
   3634 
   3635     return ret;
   3636 }
   3637 
   3638 int coroutine_fn
   3639 bdrv_co_snapshot_block_status(BlockDriverState *bs,
   3640                               bool want_zero, int64_t offset, int64_t bytes,
   3641                               int64_t *pnum, int64_t *map,
   3642                               BlockDriverState **file)
   3643 {
   3644     BlockDriver *drv = bs->drv;
   3645     int ret;
   3646     IO_CODE();
   3647 
   3648     if (!drv) {
   3649         return -ENOMEDIUM;
   3650     }
   3651 
   3652     if (!drv->bdrv_co_snapshot_block_status) {
   3653         return -ENOTSUP;
   3654     }
   3655 
   3656     bdrv_inc_in_flight(bs);
   3657     ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
   3658                                              pnum, map, file);
   3659     bdrv_dec_in_flight(bs);
   3660 
   3661     return ret;
   3662 }
   3663 
   3664 int coroutine_fn
   3665 bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
   3666 {
   3667     BlockDriver *drv = bs->drv;
   3668     int ret;
   3669     IO_CODE();
   3670 
   3671     if (!drv) {
   3672         return -ENOMEDIUM;
   3673     }
   3674 
   3675     if (!drv->bdrv_co_pdiscard_snapshot) {
   3676         return -ENOTSUP;
   3677     }
   3678 
   3679     bdrv_inc_in_flight(bs);
   3680     ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
   3681     bdrv_dec_in_flight(bs);
   3682 
   3683     return ret;
   3684 }