qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

copy-before-write.c (16561B)


      1 /*
      2  * copy-before-write filter driver
      3  *
      4  * The driver performs Copy-Before-Write (CBW) operation: it is injected above
      5  * some node, and before each write it copies _old_ data to the target node.
      6  *
      7  * Copyright (c) 2018-2021 Virtuozzo International GmbH.
      8  *
      9  * Author:
     10  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
     11  *
     12  * This program is free software; you can redistribute it and/or modify
     13  * it under the terms of the GNU General Public License as published by
     14  * the Free Software Foundation; either version 2 of the License, or
     15  * (at your option) any later version.
     16  *
     17  * This program is distributed in the hope that it will be useful,
     18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     20  * GNU General Public License for more details.
     21  *
     22  * You should have received a copy of the GNU General Public License
     23  * along with this program. If not, see <http://www.gnu.org/licenses/>.
     24  */
     25 
     26 #include "qemu/osdep.h"
     27 #include "qapi/qmp/qjson.h"
     28 
     29 #include "sysemu/block-backend.h"
     30 #include "qemu/cutils.h"
     31 #include "qapi/error.h"
     32 #include "block/block_int.h"
     33 #include "block/qdict.h"
     34 #include "block/block-copy.h"
     35 
     36 #include "block/copy-before-write.h"
     37 #include "block/reqlist.h"
     38 
     39 #include "qapi/qapi-visit-block-core.h"
     40 
     41 typedef struct BDRVCopyBeforeWriteState {
     42     BlockCopyState *bcs;
     43     BdrvChild *target;
     44     OnCbwError on_cbw_error;
     45     uint32_t cbw_timeout_ns;
     46 
     47     /*
     48      * @lock: protects access to @access_bitmap, @done_bitmap and
     49      * @frozen_read_reqs
     50      */
     51     CoMutex lock;
     52 
     53     /*
     54      * @access_bitmap: represents areas allowed for reading by fleecing user.
     55      * Reading from non-dirty areas leads to -EACCES.
     56      */
     57     BdrvDirtyBitmap *access_bitmap;
     58 
     59     /*
     60      * @done_bitmap: represents areas that was successfully copied to @target by
     61      * copy-before-write operations.
     62      */
     63     BdrvDirtyBitmap *done_bitmap;
     64 
     65     /*
     66      * @frozen_read_reqs: current read requests for fleecing user in bs->file
     67      * node. These areas must not be rewritten by guest.
     68      */
     69     BlockReqList frozen_read_reqs;
     70 
     71     /*
     72      * @snapshot_error is normally zero. But on first copy-before-write failure
     73      * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes
     74      * value of this error (<0). After that all in-flight and further
     75      * snapshot-API requests will fail with that error.
     76      */
     77     int snapshot_error;
     78 } BDRVCopyBeforeWriteState;
     79 
     80 static coroutine_fn int cbw_co_preadv(
     81         BlockDriverState *bs, int64_t offset, int64_t bytes,
     82         QEMUIOVector *qiov, BdrvRequestFlags flags)
     83 {
     84     return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
     85 }
     86 
     87 static void block_copy_cb(void *opaque)
     88 {
     89     BlockDriverState *bs = opaque;
     90 
     91     bdrv_dec_in_flight(bs);
     92 }
     93 
     94 /*
     95  * Do copy-before-write operation.
     96  *
     97  * On failure guest request must be failed too.
     98  *
     99  * On success, we also wait for all in-flight fleecing read requests in source
    100  * node, and it's guaranteed that after cbw_do_copy_before_write() successful
    101  * return there are no such requests and they will never appear.
    102  */
    103 static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
    104         uint64_t offset, uint64_t bytes, BdrvRequestFlags flags)
    105 {
    106     BDRVCopyBeforeWriteState *s = bs->opaque;
    107     int ret;
    108     uint64_t off, end;
    109     int64_t cluster_size = block_copy_cluster_size(s->bcs);
    110 
    111     if (flags & BDRV_REQ_WRITE_UNCHANGED) {
    112         return 0;
    113     }
    114 
    115     if (s->snapshot_error) {
    116         return 0;
    117     }
    118 
    119     off = QEMU_ALIGN_DOWN(offset, cluster_size);
    120     end = QEMU_ALIGN_UP(offset + bytes, cluster_size);
    121 
    122     /*
    123      * Increase in_flight, so that in case of timed-out block-copy, the
    124      * remaining background block_copy() request (which can't be immediately
    125      * cancelled by timeout) is presented in bs->in_flight. This way we are
    126      * sure that on bs close() we'll previously wait for all timed-out but yet
    127      * running block_copy calls.
    128      */
    129     bdrv_inc_in_flight(bs);
    130     ret = block_copy(s->bcs, off, end - off, true, s->cbw_timeout_ns,
    131                      block_copy_cb, bs);
    132     if (ret < 0 && s->on_cbw_error == ON_CBW_ERROR_BREAK_GUEST_WRITE) {
    133         return ret;
    134     }
    135 
    136     WITH_QEMU_LOCK_GUARD(&s->lock) {
    137         if (ret < 0) {
    138             assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT);
    139             if (!s->snapshot_error) {
    140                 s->snapshot_error = ret;
    141             }
    142         } else {
    143             bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
    144         }
    145         reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock);
    146     }
    147 
    148     return 0;
    149 }
    150 
    151 static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs,
    152                                         int64_t offset, int64_t bytes)
    153 {
    154     int ret = cbw_do_copy_before_write(bs, offset, bytes, 0);
    155     if (ret < 0) {
    156         return ret;
    157     }
    158 
    159     return bdrv_co_pdiscard(bs->file, offset, bytes);
    160 }
    161 
    162 static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs,
    163         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
    164 {
    165     int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
    166     if (ret < 0) {
    167         return ret;
    168     }
    169 
    170     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
    171 }
    172 
    173 static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs,
    174                                        int64_t offset,
    175                                        int64_t bytes,
    176                                        QEMUIOVector *qiov,
    177                                        BdrvRequestFlags flags)
    178 {
    179     int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
    180     if (ret < 0) {
    181         return ret;
    182     }
    183 
    184     return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
    185 }
    186 
    187 static int coroutine_fn cbw_co_flush(BlockDriverState *bs)
    188 {
    189     if (!bs->file) {
    190         return 0;
    191     }
    192 
    193     return bdrv_co_flush(bs->file->bs);
    194 }
    195 
    196 /*
    197  * If @offset not accessible - return NULL.
    198  *
    199  * Otherwise, set @pnum to some bytes that accessible from @file (@file is set
    200  * to bs->file or to s->target). Return newly allocated BlockReq object that
    201  * should be than passed to cbw_snapshot_read_unlock().
    202  *
    203  * It's guaranteed that guest writes will not interact in the region until
    204  * cbw_snapshot_read_unlock() called.
    205  */
    206 static coroutine_fn BlockReq *
    207 cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
    208                        int64_t *pnum, BdrvChild **file)
    209 {
    210     BDRVCopyBeforeWriteState *s = bs->opaque;
    211     BlockReq *req = g_new(BlockReq, 1);
    212     bool done;
    213 
    214     QEMU_LOCK_GUARD(&s->lock);
    215 
    216     if (s->snapshot_error) {
    217         g_free(req);
    218         return NULL;
    219     }
    220 
    221     if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
    222         g_free(req);
    223         return NULL;
    224     }
    225 
    226     done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum);
    227     if (done) {
    228         /*
    229          * Special invalid BlockReq, that is handled in
    230          * cbw_snapshot_read_unlock(). We don't need to lock something to read
    231          * from s->target.
    232          */
    233         *req = (BlockReq) {.offset = -1, .bytes = -1};
    234         *file = s->target;
    235     } else {
    236         reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes);
    237         *file = bs->file;
    238     }
    239 
    240     return req;
    241 }
    242 
    243 static coroutine_fn void
    244 cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
    245 {
    246     BDRVCopyBeforeWriteState *s = bs->opaque;
    247 
    248     if (req->offset == -1 && req->bytes == -1) {
    249         g_free(req);
    250         return;
    251     }
    252 
    253     QEMU_LOCK_GUARD(&s->lock);
    254 
    255     reqlist_remove_req(req);
    256     g_free(req);
    257 }
    258 
    259 static coroutine_fn int
    260 cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
    261                        QEMUIOVector *qiov, size_t qiov_offset)
    262 {
    263     BlockReq *req;
    264     BdrvChild *file;
    265     int ret;
    266 
    267     /* TODO: upgrade to async loop using AioTask */
    268     while (bytes) {
    269         int64_t cur_bytes;
    270 
    271         req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file);
    272         if (!req) {
    273             return -EACCES;
    274         }
    275 
    276         ret = bdrv_co_preadv_part(file, offset, cur_bytes,
    277                                   qiov, qiov_offset, 0);
    278         cbw_snapshot_read_unlock(bs, req);
    279         if (ret < 0) {
    280             return ret;
    281         }
    282 
    283         bytes -= cur_bytes;
    284         offset += cur_bytes;
    285         qiov_offset += cur_bytes;
    286     }
    287 
    288     return 0;
    289 }
    290 
    291 static int coroutine_fn
    292 cbw_co_snapshot_block_status(BlockDriverState *bs,
    293                              bool want_zero, int64_t offset, int64_t bytes,
    294                              int64_t *pnum, int64_t *map,
    295                              BlockDriverState **file)
    296 {
    297     BDRVCopyBeforeWriteState *s = bs->opaque;
    298     BlockReq *req;
    299     int ret;
    300     int64_t cur_bytes;
    301     BdrvChild *child;
    302 
    303     req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child);
    304     if (!req) {
    305         return -EACCES;
    306     }
    307 
    308     ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file);
    309     if (child == s->target) {
    310         /*
    311          * We refer to s->target only for areas that we've written to it.
    312          * And we can not report unallocated blocks in s->target: this will
    313          * break generic block-status-above logic, that will go to
    314          * copy-before-write filtered child in this case.
    315          */
    316         assert(ret & BDRV_BLOCK_ALLOCATED);
    317     }
    318 
    319     cbw_snapshot_read_unlock(bs, req);
    320 
    321     return ret;
    322 }
    323 
    324 static int coroutine_fn cbw_co_pdiscard_snapshot(BlockDriverState *bs,
    325                                                  int64_t offset, int64_t bytes)
    326 {
    327     BDRVCopyBeforeWriteState *s = bs->opaque;
    328 
    329     WITH_QEMU_LOCK_GUARD(&s->lock) {
    330         bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
    331     }
    332 
    333     block_copy_reset(s->bcs, offset, bytes);
    334 
    335     return bdrv_co_pdiscard(s->target, offset, bytes);
    336 }
    337 
    338 static void cbw_refresh_filename(BlockDriverState *bs)
    339 {
    340     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
    341             bs->file->bs->filename);
    342 }
    343 
    344 static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
    345                            BdrvChildRole role,
    346                            BlockReopenQueue *reopen_queue,
    347                            uint64_t perm, uint64_t shared,
    348                            uint64_t *nperm, uint64_t *nshared)
    349 {
    350     if (!(role & BDRV_CHILD_FILTERED)) {
    351         /*
    352          * Target child
    353          *
    354          * Share write to target (child_file), to not interfere
    355          * with guest writes to its disk which may be in target backing chain.
    356          * Can't resize during a backup block job because we check the size
    357          * only upfront.
    358          */
    359         *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
    360         *nperm = BLK_PERM_WRITE;
    361     } else {
    362         /* Source child */
    363         bdrv_default_perms(bs, c, role, reopen_queue,
    364                            perm, shared, nperm, nshared);
    365 
    366         if (!QLIST_EMPTY(&bs->parents)) {
    367             if (perm & BLK_PERM_WRITE) {
    368                 *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
    369             }
    370             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
    371         }
    372     }
    373 }
    374 
    375 static BlockdevOptions *cbw_parse_options(QDict *options, Error **errp)
    376 {
    377     BlockdevOptions *opts = NULL;
    378     Visitor *v = NULL;
    379 
    380     qdict_put_str(options, "driver", "copy-before-write");
    381 
    382     v = qobject_input_visitor_new_flat_confused(options, errp);
    383     if (!v) {
    384         goto out;
    385     }
    386 
    387     visit_type_BlockdevOptions(v, NULL, &opts, errp);
    388     if (!opts) {
    389         goto out;
    390     }
    391 
    392     /*
    393      * Delete options which we are going to parse through BlockdevOptions
    394      * object for original options.
    395      */
    396     qdict_extract_subqdict(options, NULL, "bitmap");
    397     qdict_del(options, "on-cbw-error");
    398     qdict_del(options, "cbw-timeout");
    399 
    400 out:
    401     visit_free(v);
    402     qdict_del(options, "driver");
    403 
    404     return opts;
    405 }
    406 
    407 static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
    408                     Error **errp)
    409 {
    410     BDRVCopyBeforeWriteState *s = bs->opaque;
    411     BdrvDirtyBitmap *bitmap = NULL;
    412     int64_t cluster_size;
    413     g_autoptr(BlockdevOptions) full_opts = NULL;
    414     BlockdevOptionsCbw *opts;
    415     int ret;
    416 
    417     full_opts = cbw_parse_options(options, errp);
    418     if (!full_opts) {
    419         return -EINVAL;
    420     }
    421     assert(full_opts->driver == BLOCKDEV_DRIVER_COPY_BEFORE_WRITE);
    422     opts = &full_opts->u.copy_before_write;
    423 
    424     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    425     if (ret < 0) {
    426         return ret;
    427     }
    428 
    429     s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds,
    430                                 BDRV_CHILD_DATA, false, errp);
    431     if (!s->target) {
    432         return -EINVAL;
    433     }
    434 
    435     if (opts->has_bitmap) {
    436         bitmap = block_dirty_bitmap_lookup(opts->bitmap->node,
    437                                            opts->bitmap->name, NULL, errp);
    438         if (!bitmap) {
    439             return -EINVAL;
    440         }
    441     }
    442     s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error :
    443             ON_CBW_ERROR_BREAK_GUEST_WRITE;
    444     s->cbw_timeout_ns = opts->has_cbw_timeout ?
    445         opts->cbw_timeout * NANOSECONDS_PER_SECOND : 0;
    446 
    447     bs->total_sectors = bs->file->bs->total_sectors;
    448     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
    449             (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
    450     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
    451             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
    452              bs->file->bs->supported_zero_flags);
    453 
    454     s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
    455     if (!s->bcs) {
    456         error_prepend(errp, "Cannot create block-copy-state: ");
    457         return -EINVAL;
    458     }
    459 
    460     cluster_size = block_copy_cluster_size(s->bcs);
    461 
    462     s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
    463     if (!s->done_bitmap) {
    464         return -EINVAL;
    465     }
    466     bdrv_disable_dirty_bitmap(s->done_bitmap);
    467 
    468     /* s->access_bitmap starts equal to bcs bitmap */
    469     s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
    470     if (!s->access_bitmap) {
    471         return -EINVAL;
    472     }
    473     bdrv_disable_dirty_bitmap(s->access_bitmap);
    474     bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
    475                                      block_copy_dirty_bitmap(s->bcs), NULL,
    476                                      true);
    477 
    478     qemu_co_mutex_init(&s->lock);
    479     QLIST_INIT(&s->frozen_read_reqs);
    480 
    481     return 0;
    482 }
    483 
    484 static void cbw_close(BlockDriverState *bs)
    485 {
    486     BDRVCopyBeforeWriteState *s = bs->opaque;
    487 
    488     bdrv_release_dirty_bitmap(s->access_bitmap);
    489     bdrv_release_dirty_bitmap(s->done_bitmap);
    490 
    491     block_copy_state_free(s->bcs);
    492     s->bcs = NULL;
    493 }
    494 
    495 BlockDriver bdrv_cbw_filter = {
    496     .format_name = "copy-before-write",
    497     .instance_size = sizeof(BDRVCopyBeforeWriteState),
    498 
    499     .bdrv_open                  = cbw_open,
    500     .bdrv_close                 = cbw_close,
    501 
    502     .bdrv_co_preadv             = cbw_co_preadv,
    503     .bdrv_co_pwritev            = cbw_co_pwritev,
    504     .bdrv_co_pwrite_zeroes      = cbw_co_pwrite_zeroes,
    505     .bdrv_co_pdiscard           = cbw_co_pdiscard,
    506     .bdrv_co_flush              = cbw_co_flush,
    507 
    508     .bdrv_co_preadv_snapshot       = cbw_co_preadv_snapshot,
    509     .bdrv_co_pdiscard_snapshot     = cbw_co_pdiscard_snapshot,
    510     .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status,
    511 
    512     .bdrv_refresh_filename      = cbw_refresh_filename,
    513 
    514     .bdrv_child_perm            = cbw_child_perm,
    515 
    516     .is_filter = true,
    517 };
    518 
    519 BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
    520                                   BlockDriverState *target,
    521                                   const char *filter_node_name,
    522                                   BlockCopyState **bcs,
    523                                   Error **errp)
    524 {
    525     ERRP_GUARD();
    526     BDRVCopyBeforeWriteState *state;
    527     BlockDriverState *top;
    528     QDict *opts;
    529 
    530     assert(source->total_sectors == target->total_sectors);
    531     GLOBAL_STATE_CODE();
    532 
    533     opts = qdict_new();
    534     qdict_put_str(opts, "driver", "copy-before-write");
    535     if (filter_node_name) {
    536         qdict_put_str(opts, "node-name", filter_node_name);
    537     }
    538     qdict_put_str(opts, "file", bdrv_get_node_name(source));
    539     qdict_put_str(opts, "target", bdrv_get_node_name(target));
    540 
    541     top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
    542     if (!top) {
    543         return NULL;
    544     }
    545 
    546     state = top->opaque;
    547     *bcs = state->bcs;
    548 
    549     return top;
    550 }
    551 
    552 void bdrv_cbw_drop(BlockDriverState *bs)
    553 {
    554     GLOBAL_STATE_CODE();
    555     bdrv_drop_filter(bs, &error_abort);
    556     bdrv_unref(bs);
    557 }
    558 
    559 static void cbw_init(void)
    560 {
    561     bdrv_register(&bdrv_cbw_filter);
    562 }
    563 
    564 block_init(cbw_init);