qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

block.c (245584B)


      1 /*
      2  * QEMU System Emulator block driver
      3  *
      4  * Copyright (c) 2003 Fabrice Bellard
      5  * Copyright (c) 2020 Virtuozzo International GmbH.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a copy
      8  * of this software and associated documentation files (the "Software"), to deal
      9  * in the Software without restriction, including without limitation the rights
     10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     11  * copies of the Software, and to permit persons to whom the Software is
     12  * furnished to do so, subject to the following conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be included in
     15  * all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     23  * THE SOFTWARE.
     24  */
     25 
     26 #include "qemu/osdep.h"
     27 #include "block/trace.h"
     28 #include "block/block_int.h"
     29 #include "block/blockjob.h"
     30 #include "block/fuse.h"
     31 #include "block/nbd.h"
     32 #include "block/qdict.h"
     33 #include "qemu/error-report.h"
     34 #include "block/module_block.h"
     35 #include "qemu/main-loop.h"
     36 #include "qemu/module.h"
     37 #include "qapi/error.h"
     38 #include "qapi/qmp/qdict.h"
     39 #include "qapi/qmp/qjson.h"
     40 #include "qapi/qmp/qnull.h"
     41 #include "qapi/qmp/qstring.h"
     42 #include "qapi/qobject-output-visitor.h"
     43 #include "qapi/qapi-visit-block-core.h"
     44 #include "sysemu/block-backend.h"
     45 #include "qemu/notify.h"
     46 #include "qemu/option.h"
     47 #include "qemu/coroutine.h"
     48 #include "block/qapi.h"
     49 #include "qemu/timer.h"
     50 #include "qemu/cutils.h"
     51 #include "qemu/id.h"
     52 #include "qemu/range.h"
     53 #include "qemu/rcu.h"
     54 #include "block/coroutines.h"
     55 
     56 #ifdef CONFIG_BSD
     57 #include <sys/ioctl.h>
     58 #include <sys/queue.h>
     59 #if defined(HAVE_SYS_DISK_H)
     60 #include <sys/disk.h>
     61 #endif
     62 #endif
     63 
     64 #ifdef _WIN32
     65 #include <windows.h>
     66 #endif
     67 
/*
 * Sentinel for the result field of a synchronous wrapper: bdrv_create()
 * stores it in CreateCo.ret before starting the coroutine and polls until
 * the coroutine overwrites it with the real return value.
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/*
 * Protected by BQL.
 * NOTE(review): nothing in this part of the file adds to or reads this
 * list; presumably it holds the nodes that are part of the named block
 * graph -- confirm against the users further down.
 */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/*
 * Protected by BQL.
 * Every BlockDriverState created by bdrv_new() is appended here.
 */
static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);

/*
 * Protected by BQL.
 * Registered block drivers: bdrv_register() inserts at the head, the
 * format/protocol/host-device lookup helpers iterate over it.
 */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
     81 
/*
 * Forward declarations of file-local helpers.  Their definitions are not
 * part of the code shown up to bdrv_find_protocol(); presumably they follow
 * further down in this file.
 */
static BlockDriverState *bdrv_open_inherit(const char *filename,
                                           const char *reference,
                                           QDict *options, int flags,
                                           BlockDriverState *parent,
                                           const BdrvChildClass *child_class,
                                           BdrvChildRole child_role,
                                           Error **errp);

static bool bdrv_recurse_has_child(BlockDriverState *bs,
                                   BlockDriverState *child);

static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs);
static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
                                            Transaction *tran);

static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
                               Transaction *change_child_tran, Error **errp);
static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
static void bdrv_reopen_abort(BDRVReopenState *reopen_state);

static bool bdrv_backing_overridden(BlockDriverState *bs);

static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
                                    GHashTable *visited, Transaction *tran,
                                    Error **errp);

/*
 * If non-zero, use only whitelisted block drivers.
 * Exposed to other code through bdrv_uses_whitelist().
 */
static int use_bdrv_whitelist;
    113 
    114 #ifdef _WIN32
    115 static int is_windows_drive_prefix(const char *filename)
    116 {
    117     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
    118              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
    119             filename[1] == ':');
    120 }
    121 
    122 int is_windows_drive(const char *filename)
    123 {
    124     if (is_windows_drive_prefix(filename) &&
    125         filename[2] == '\0')
    126         return 1;
    127     if (strstart(filename, "\\\\.\\", NULL) ||
    128         strstart(filename, "//./", NULL))
    129         return 1;
    130     return 0;
    131 }
    132 #endif
    133 
    134 size_t bdrv_opt_mem_align(BlockDriverState *bs)
    135 {
    136     if (!bs || !bs->drv) {
    137         /* page size or 4k (hdd sector size) should be on the safe side */
    138         return MAX(4096, qemu_real_host_page_size());
    139     }
    140     IO_CODE();
    141 
    142     return bs->bl.opt_mem_alignment;
    143 }
    144 
    145 size_t bdrv_min_mem_align(BlockDriverState *bs)
    146 {
    147     if (!bs || !bs->drv) {
    148         /* page size or 4k (hdd sector size) should be on the safe side */
    149         return MAX(4096, qemu_real_host_page_size());
    150     }
    151     IO_CODE();
    152 
    153     return bs->bl.min_mem_alignment;
    154 }
    155 
    156 /* check if the path starts with "<protocol>:" */
    157 int path_has_protocol(const char *path)
    158 {
    159     const char *p;
    160 
    161 #ifdef _WIN32
    162     if (is_windows_drive(path) ||
    163         is_windows_drive_prefix(path)) {
    164         return 0;
    165     }
    166     p = path + strcspn(path, ":/\\");
    167 #else
    168     p = path + strcspn(path, ":/");
    169 #endif
    170 
    171     return *p == ':';
    172 }
    173 
    174 int path_is_absolute(const char *path)
    175 {
    176 #ifdef _WIN32
    177     /* specific case for names like: "\\.\d:" */
    178     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
    179         return 1;
    180     }
    181     return (*path == '/' || *path == '\\');
    182 #else
    183     return (*path == '/');
    184 #endif
    185 }
    186 
    187 /* if filename is absolute, just return its duplicate. Otherwise, build a
    188    path to it by considering it is relative to base_path. URL are
    189    supported. */
    190 char *path_combine(const char *base_path, const char *filename)
    191 {
    192     const char *protocol_stripped = NULL;
    193     const char *p, *p1;
    194     char *result;
    195     int len;
    196 
    197     if (path_is_absolute(filename)) {
    198         return g_strdup(filename);
    199     }
    200 
    201     if (path_has_protocol(base_path)) {
    202         protocol_stripped = strchr(base_path, ':');
    203         if (protocol_stripped) {
    204             protocol_stripped++;
    205         }
    206     }
    207     p = protocol_stripped ?: base_path;
    208 
    209     p1 = strrchr(base_path, '/');
    210 #ifdef _WIN32
    211     {
    212         const char *p2;
    213         p2 = strrchr(base_path, '\\');
    214         if (!p1 || p2 > p1) {
    215             p1 = p2;
    216         }
    217     }
    218 #endif
    219     if (p1) {
    220         p1++;
    221     } else {
    222         p1 = base_path;
    223     }
    224     if (p1 > p) {
    225         p = p1;
    226     }
    227     len = p - base_path;
    228 
    229     result = g_malloc(len + strlen(filename) + 1);
    230     memcpy(result, base_path, len);
    231     strcpy(result + len, filename);
    232 
    233     return result;
    234 }
    235 
    236 /*
    237  * Helper function for bdrv_parse_filename() implementations to remove optional
    238  * protocol prefixes (especially "file:") from a filename and for putting the
    239  * stripped filename into the options QDict if there is such a prefix.
    240  */
    241 void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
    242                                       QDict *options)
    243 {
    244     if (strstart(filename, prefix, &filename)) {
    245         /* Stripping the explicit protocol prefix may result in a protocol
    246          * prefix being (wrongly) detected (if the filename contains a colon) */
    247         if (path_has_protocol(filename)) {
    248             GString *fat_filename;
    249 
    250             /* This means there is some colon before the first slash; therefore,
    251              * this cannot be an absolute path */
    252             assert(!path_is_absolute(filename));
    253 
    254             /* And we can thus fix the protocol detection issue by prefixing it
    255              * by "./" */
    256             fat_filename = g_string_new("./");
    257             g_string_append(fat_filename, filename);
    258 
    259             assert(!path_has_protocol(fat_filename->str));
    260 
    261             qdict_put(options, "filename",
    262                       qstring_from_gstring(fat_filename));
    263         } else {
    264             /* If no protocol prefix was detected, we can use the shortened
    265              * filename as-is */
    266             qdict_put_str(options, "filename", filename);
    267         }
    268     }
    269 }
    270 
    271 
    272 /* Returns whether the image file is opened as read-only. Note that this can
    273  * return false and writing to the image file is still not possible because the
    274  * image is inactivated. */
    275 bool bdrv_is_read_only(BlockDriverState *bs)
    276 {
    277     IO_CODE();
    278     return !(bs->open_flags & BDRV_O_RDWR);
    279 }
    280 
    281 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
    282                            bool ignore_allow_rdw, Error **errp)
    283 {
    284     IO_CODE();
    285 
    286     /* Do not set read_only if copy_on_read is enabled */
    287     if (bs->copy_on_read && read_only) {
    288         error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
    289                    bdrv_get_device_or_node_name(bs));
    290         return -EINVAL;
    291     }
    292 
    293     /* Do not clear read_only if it is prohibited */
    294     if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
    295         !ignore_allow_rdw)
    296     {
    297         error_setg(errp, "Node '%s' is read only",
    298                    bdrv_get_device_or_node_name(bs));
    299         return -EPERM;
    300     }
    301 
    302     return 0;
    303 }
    304 
    305 /*
    306  * Called by a driver that can only provide a read-only image.
    307  *
    308  * Returns 0 if the node is already read-only or it could switch the node to
    309  * read-only because BDRV_O_AUTO_RDONLY is set.
    310  *
    311  * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
    312  * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
    313  * is not NULL, it is used as the error message for the Error object.
    314  */
    315 int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
    316                               Error **errp)
    317 {
    318     int ret = 0;
    319     IO_CODE();
    320 
    321     if (!(bs->open_flags & BDRV_O_RDWR)) {
    322         return 0;
    323     }
    324     if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
    325         goto fail;
    326     }
    327 
    328     ret = bdrv_can_set_read_only(bs, true, false, NULL);
    329     if (ret < 0) {
    330         goto fail;
    331     }
    332 
    333     bs->open_flags &= ~BDRV_O_RDWR;
    334 
    335     return 0;
    336 
    337 fail:
    338     error_setg(errp, "%s", errmsg ?: "Image is read-only");
    339     return -EACCES;
    340 }
    341 
    342 /*
    343  * If @backing is empty, this function returns NULL without setting
    344  * @errp.  In all other cases, NULL will only be returned with @errp
    345  * set.
    346  *
    347  * Therefore, a return value of NULL without @errp set means that
    348  * there is no backing file; if @errp is set, there is one but its
    349  * absolute filename cannot be generated.
    350  */
    351 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
    352                                                    const char *backing,
    353                                                    Error **errp)
    354 {
    355     if (backing[0] == '\0') {
    356         return NULL;
    357     } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
    358         return g_strdup(backing);
    359     } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
    360         error_setg(errp, "Cannot use relative backing file names for '%s'",
    361                    backed);
    362         return NULL;
    363     } else {
    364         return path_combine(backed, backing);
    365     }
    366 }
    367 
    368 /*
    369  * If @filename is empty or NULL, this function returns NULL without
    370  * setting @errp.  In all other cases, NULL will only be returned with
    371  * @errp set.
    372  */
    373 static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
    374                                          const char *filename, Error **errp)
    375 {
    376     char *dir, *full_name;
    377 
    378     if (!filename || filename[0] == '\0') {
    379         return NULL;
    380     } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
    381         return g_strdup(filename);
    382     }
    383 
    384     dir = bdrv_dirname(relative_to, errp);
    385     if (!dir) {
    386         return NULL;
    387     }
    388 
    389     full_name = g_strconcat(dir, filename, NULL);
    390     g_free(dir);
    391     return full_name;
    392 }
    393 
    394 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
    395 {
    396     GLOBAL_STATE_CODE();
    397     return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
    398 }
    399 
    400 void bdrv_register(BlockDriver *bdrv)
    401 {
    402     assert(bdrv->format_name);
    403     GLOBAL_STATE_CODE();
    404     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
    405 }
    406 
    407 BlockDriverState *bdrv_new(void)
    408 {
    409     BlockDriverState *bs;
    410     int i;
    411 
    412     GLOBAL_STATE_CODE();
    413 
    414     bs = g_new0(BlockDriverState, 1);
    415     QLIST_INIT(&bs->dirty_bitmaps);
    416     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
    417         QLIST_INIT(&bs->op_blockers[i]);
    418     }
    419     qemu_co_mutex_init(&bs->reqs_lock);
    420     qemu_mutex_init(&bs->dirty_bitmap_mutex);
    421     bs->refcnt = 1;
    422     bs->aio_context = qemu_get_aio_context();
    423 
    424     qemu_co_queue_init(&bs->flush_queue);
    425 
    426     qemu_co_mutex_init(&bs->bsc_modify_lock);
    427     bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
    428 
    429     for (i = 0; i < bdrv_drain_all_count; i++) {
    430         bdrv_drained_begin(bs);
    431     }
    432 
    433     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
    434 
    435     return bs;
    436 }
    437 
    438 static BlockDriver *bdrv_do_find_format(const char *format_name)
    439 {
    440     BlockDriver *drv1;
    441     GLOBAL_STATE_CODE();
    442 
    443     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
    444         if (!strcmp(drv1->format_name, format_name)) {
    445             return drv1;
    446         }
    447     }
    448 
    449     return NULL;
    450 }
    451 
    452 BlockDriver *bdrv_find_format(const char *format_name)
    453 {
    454     BlockDriver *drv1;
    455     int i;
    456 
    457     GLOBAL_STATE_CODE();
    458 
    459     drv1 = bdrv_do_find_format(format_name);
    460     if (drv1) {
    461         return drv1;
    462     }
    463 
    464     /* The driver isn't registered, maybe we need to load a module */
    465     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
    466         if (!strcmp(block_driver_modules[i].format_name, format_name)) {
    467             Error *local_err = NULL;
    468             int rv = block_module_load(block_driver_modules[i].library_name,
    469                                        &local_err);
    470             if (rv > 0) {
    471                 return bdrv_do_find_format(format_name);
    472             } else if (rv < 0) {
    473                 error_report_err(local_err);
    474             }
    475             break;
    476         }
    477     }
    478     return NULL;
    479 }
    480 
    481 static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
    482 {
    483     static const char *whitelist_rw[] = {
    484         CONFIG_BDRV_RW_WHITELIST
    485         NULL
    486     };
    487     static const char *whitelist_ro[] = {
    488         CONFIG_BDRV_RO_WHITELIST
    489         NULL
    490     };
    491     const char **p;
    492 
    493     if (!whitelist_rw[0] && !whitelist_ro[0]) {
    494         return 1;               /* no whitelist, anything goes */
    495     }
    496 
    497     for (p = whitelist_rw; *p; p++) {
    498         if (!strcmp(format_name, *p)) {
    499             return 1;
    500         }
    501     }
    502     if (read_only) {
    503         for (p = whitelist_ro; *p; p++) {
    504             if (!strcmp(format_name, *p)) {
    505                 return 1;
    506             }
    507         }
    508     }
    509     return 0;
    510 }
    511 
    512 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
    513 {
    514     GLOBAL_STATE_CODE();
    515     return bdrv_format_is_whitelisted(drv->format_name, read_only);
    516 }
    517 
    518 bool bdrv_uses_whitelist(void)
    519 {
    520     return use_bdrv_whitelist;
    521 }
    522 
/*
 * Arguments and result of one image creation request, passed from
 * bdrv_create() to the coroutine entry point bdrv_create_co_entry().
 */
typedef struct CreateCo {
    BlockDriver *drv;   /* driver whose bdrv_co_create_opts() is called */
    char *filename;     /* copy of the file name, freed by bdrv_create() */
    QemuOpts *opts;     /* creation options handed to the driver */
    int ret;            /* NOT_DONE until the coroutine stores its result */
    Error *err;         /* error set by the driver callback, if any */
} CreateCo;
    530 
    531 static void coroutine_fn bdrv_create_co_entry(void *opaque)
    532 {
    533     Error *local_err = NULL;
    534     int ret;
    535 
    536     CreateCo *cco = opaque;
    537     assert(cco->drv);
    538     GLOBAL_STATE_CODE();
    539 
    540     ret = cco->drv->bdrv_co_create_opts(cco->drv,
    541                                         cco->filename, cco->opts, &local_err);
    542     error_propagate(&cco->err, local_err);
    543     cco->ret = ret;
    544 }
    545 
    546 int bdrv_create(BlockDriver *drv, const char* filename,
    547                 QemuOpts *opts, Error **errp)
    548 {
    549     int ret;
    550 
    551     GLOBAL_STATE_CODE();
    552 
    553     Coroutine *co;
    554     CreateCo cco = {
    555         .drv = drv,
    556         .filename = g_strdup(filename),
    557         .opts = opts,
    558         .ret = NOT_DONE,
    559         .err = NULL,
    560     };
    561 
    562     if (!drv->bdrv_co_create_opts) {
    563         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
    564         ret = -ENOTSUP;
    565         goto out;
    566     }
    567 
    568     if (qemu_in_coroutine()) {
    569         /* Fast-path if already in coroutine context */
    570         bdrv_create_co_entry(&cco);
    571     } else {
    572         co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
    573         qemu_coroutine_enter(co);
    574         while (cco.ret == NOT_DONE) {
    575             aio_poll(qemu_get_aio_context(), true);
    576         }
    577     }
    578 
    579     ret = cco.ret;
    580     if (ret < 0) {
    581         if (cco.err) {
    582             error_propagate(errp, cco.err);
    583         } else {
    584             error_setg_errno(errp, -ret, "Could not create image");
    585         }
    586     }
    587 
    588 out:
    589     g_free(cco.filename);
    590     return ret;
    591 }
    592 
    593 /**
    594  * Helper function for bdrv_create_file_fallback(): Resize @blk to at
    595  * least the given @minimum_size.
    596  *
    597  * On success, return @blk's actual length.
    598  * Otherwise, return -errno.
    599  */
    600 static int64_t create_file_fallback_truncate(BlockBackend *blk,
    601                                              int64_t minimum_size, Error **errp)
    602 {
    603     Error *local_err = NULL;
    604     int64_t size;
    605     int ret;
    606 
    607     GLOBAL_STATE_CODE();
    608 
    609     ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
    610                        &local_err);
    611     if (ret < 0 && ret != -ENOTSUP) {
    612         error_propagate(errp, local_err);
    613         return ret;
    614     }
    615 
    616     size = blk_getlength(blk);
    617     if (size < 0) {
    618         error_free(local_err);
    619         error_setg_errno(errp, -size,
    620                          "Failed to inquire the new image file's length");
    621         return size;
    622     }
    623 
    624     if (size < minimum_size) {
    625         /* Need to grow the image, but we failed to do that */
    626         error_propagate(errp, local_err);
    627         return -ENOTSUP;
    628     }
    629 
    630     error_free(local_err);
    631     local_err = NULL;
    632 
    633     return size;
    634 }
    635 
    636 /**
    637  * Helper function for bdrv_create_file_fallback(): Zero the first
    638  * sector to remove any potentially pre-existing image header.
    639  */
    640 static int coroutine_fn
    641 create_file_fallback_zero_first_sector(BlockBackend *blk,
    642                                        int64_t current_size,
    643                                        Error **errp)
    644 {
    645     int64_t bytes_to_clear;
    646     int ret;
    647 
    648     GLOBAL_STATE_CODE();
    649 
    650     bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
    651     if (bytes_to_clear) {
    652         ret = blk_co_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
    653         if (ret < 0) {
    654             error_setg_errno(errp, -ret,
    655                              "Failed to clear the new image's first sector");
    656             return ret;
    657         }
    658     }
    659 
    660     return 0;
    661 }
    662 
    663 /**
    664  * Simple implementation of bdrv_co_create_opts for protocol drivers
    665  * which only support creation via opening a file
    666  * (usually existing raw storage device)
    667  */
    668 int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
    669                                             const char *filename,
    670                                             QemuOpts *opts,
    671                                             Error **errp)
    672 {
    673     BlockBackend *blk;
    674     QDict *options;
    675     int64_t size = 0;
    676     char *buf = NULL;
    677     PreallocMode prealloc;
    678     Error *local_err = NULL;
    679     int ret;
    680 
    681     GLOBAL_STATE_CODE();
    682 
    683     size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
    684     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    685     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
    686                                PREALLOC_MODE_OFF, &local_err);
    687     g_free(buf);
    688     if (local_err) {
    689         error_propagate(errp, local_err);
    690         return -EINVAL;
    691     }
    692 
    693     if (prealloc != PREALLOC_MODE_OFF) {
    694         error_setg(errp, "Unsupported preallocation mode '%s'",
    695                    PreallocMode_str(prealloc));
    696         return -ENOTSUP;
    697     }
    698 
    699     options = qdict_new();
    700     qdict_put_str(options, "driver", drv->format_name);
    701 
    702     blk = blk_new_open(filename, NULL, options,
    703                        BDRV_O_RDWR | BDRV_O_RESIZE, errp);
    704     if (!blk) {
    705         error_prepend(errp, "Protocol driver '%s' does not support image "
    706                       "creation, and opening the image failed: ",
    707                       drv->format_name);
    708         return -EINVAL;
    709     }
    710 
    711     size = create_file_fallback_truncate(blk, size, errp);
    712     if (size < 0) {
    713         ret = size;
    714         goto out;
    715     }
    716 
    717     ret = create_file_fallback_zero_first_sector(blk, size, errp);
    718     if (ret < 0) {
    719         goto out;
    720     }
    721 
    722     ret = 0;
    723 out:
    724     blk_unref(blk);
    725     return ret;
    726 }
    727 
    728 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
    729 {
    730     QemuOpts *protocol_opts;
    731     BlockDriver *drv;
    732     QDict *qdict;
    733     int ret;
    734 
    735     GLOBAL_STATE_CODE();
    736 
    737     drv = bdrv_find_protocol(filename, true, errp);
    738     if (drv == NULL) {
    739         return -ENOENT;
    740     }
    741 
    742     if (!drv->create_opts) {
    743         error_setg(errp, "Driver '%s' does not support image creation",
    744                    drv->format_name);
    745         return -ENOTSUP;
    746     }
    747 
    748     /*
    749      * 'opts' contains a QemuOptsList with a combination of format and protocol
    750      * default values.
    751      *
    752      * The format properly removes its options, but the default values remain
    753      * in 'opts->list'.  So if the protocol has options with the same name
    754      * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
    755      * of the format, since for overlapping options, the format wins.
    756      *
    757      * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
    758      * only the set options, and then convert it back to QemuOpts, using the
    759      * create_opts of the protocol. So the new QemuOpts, will contain only the
    760      * protocol defaults.
    761      */
    762     qdict = qemu_opts_to_qdict(opts, NULL);
    763     protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
    764     if (protocol_opts == NULL) {
    765         ret = -EINVAL;
    766         goto out;
    767     }
    768 
    769     ret = bdrv_create(drv, filename, protocol_opts, errp);
    770 out:
    771     qemu_opts_del(protocol_opts);
    772     qobject_unref(qdict);
    773     return ret;
    774 }
    775 
    776 int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
    777 {
    778     Error *local_err = NULL;
    779     int ret;
    780 
    781     IO_CODE();
    782     assert(bs != NULL);
    783 
    784     if (!bs->drv) {
    785         error_setg(errp, "Block node '%s' is not opened", bs->filename);
    786         return -ENOMEDIUM;
    787     }
    788 
    789     if (!bs->drv->bdrv_co_delete_file) {
    790         error_setg(errp, "Driver '%s' does not support image deletion",
    791                    bs->drv->format_name);
    792         return -ENOTSUP;
    793     }
    794 
    795     ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
    796     if (ret < 0) {
    797         error_propagate(errp, local_err);
    798     }
    799 
    800     return ret;
    801 }
    802 
    803 void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
    804 {
    805     Error *local_err = NULL;
    806     int ret;
    807     IO_CODE();
    808 
    809     if (!bs) {
    810         return;
    811     }
    812 
    813     ret = bdrv_co_delete_file(bs, &local_err);
    814     /*
    815      * ENOTSUP will happen if the block driver doesn't support
    816      * the 'bdrv_co_delete_file' interface. This is a predictable
    817      * scenario and shouldn't be reported back to the user.
    818      */
    819     if (ret == -ENOTSUP) {
    820         error_free(local_err);
    821     } else if (ret < 0) {
    822         error_report_err(local_err);
    823     }
    824 }
    825 
    826 /**
    827  * Try to get @bs's logical and physical block size.
    828  * On success, store them in @bsz struct and return 0.
    829  * On failure return -errno.
    830  * @bs must not be empty.
    831  */
    832 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
    833 {
    834     BlockDriver *drv = bs->drv;
    835     BlockDriverState *filtered = bdrv_filter_bs(bs);
    836     GLOBAL_STATE_CODE();
    837 
    838     if (drv && drv->bdrv_probe_blocksizes) {
    839         return drv->bdrv_probe_blocksizes(bs, bsz);
    840     } else if (filtered) {
    841         return bdrv_probe_blocksizes(filtered, bsz);
    842     }
    843 
    844     return -ENOTSUP;
    845 }
    846 
    847 /**
    848  * Try to get @bs's geometry (cyls, heads, sectors).
    849  * On success, store them in @geo struct and return 0.
    850  * On failure return -errno.
    851  * @bs must not be empty.
    852  */
    853 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
    854 {
    855     BlockDriver *drv = bs->drv;
    856     BlockDriverState *filtered = bdrv_filter_bs(bs);
    857     GLOBAL_STATE_CODE();
    858 
    859     if (drv && drv->bdrv_probe_geometry) {
    860         return drv->bdrv_probe_geometry(bs, geo);
    861     } else if (filtered) {
    862         return bdrv_probe_geometry(filtered, geo);
    863     }
    864 
    865     return -ENOTSUP;
    866 }
    867 
    868 /*
    869  * Create a uniquely-named empty temporary file.
    870  * Return the actual file name used upon success, otherwise NULL.
    871  * This string should be freed with g_free() when not needed any longer.
    872  *
    873  * Note: creating a temporary file for the caller to (re)open is
    874  * inherently racy. Use g_file_open_tmp() instead whenever practical.
    875  */
    876 char *create_tmp_file(Error **errp)
    877 {
    878     int fd;
    879     const char *tmpdir;
    880     g_autofree char *filename = NULL;
    881 
    882     tmpdir = g_get_tmp_dir();
    883 #ifndef _WIN32
    884     /*
    885      * See commit 69bef79 ("block: use /var/tmp instead of /tmp for -snapshot")
    886      *
    887      * This function is used to create temporary disk images (like -snapshot),
    888      * so the files can become very large. /tmp is often a tmpfs where as
    889      * /var/tmp is usually on a disk, so more appropriate for disk images.
    890      */
    891     if (!g_strcmp0(tmpdir, "/tmp")) {
    892         tmpdir = "/var/tmp";
    893     }
    894 #endif
    895 
    896     filename = g_strdup_printf("%s/vl.XXXXXX", tmpdir);
    897     fd = g_mkstemp(filename);
    898     if (fd < 0) {
    899         error_setg_errno(errp, errno, "Could not open temporary file '%s'",
    900                          filename);
    901         return NULL;
    902     }
    903     close(fd);
    904 
    905     return g_steal_pointer(&filename);
    906 }
    907 
    908 /*
    909  * Detect host devices. By convention, /dev/cdrom[N] is always
    910  * recognized as a host CDROM.
    911  */
    912 static BlockDriver *find_hdev_driver(const char *filename)
    913 {
    914     int score_max = 0, score;
    915     BlockDriver *drv = NULL, *d;
    916     GLOBAL_STATE_CODE();
    917 
    918     QLIST_FOREACH(d, &bdrv_drivers, list) {
    919         if (d->bdrv_probe_device) {
    920             score = d->bdrv_probe_device(filename);
    921             if (score > score_max) {
    922                 score_max = score;
    923                 drv = d;
    924             }
    925         }
    926     }
    927 
    928     return drv;
    929 }
    930 
    931 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
    932 {
    933     BlockDriver *drv1;
    934     GLOBAL_STATE_CODE();
    935 
    936     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
    937         if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
    938             return drv1;
    939         }
    940     }
    941 
    942     return NULL;
    943 }
    944 
    945 BlockDriver *bdrv_find_protocol(const char *filename,
    946                                 bool allow_protocol_prefix,
    947                                 Error **errp)
    948 {
    949     BlockDriver *drv1;
    950     char protocol[128];
    951     int len;
    952     const char *p;
    953     int i;
    954 
    955     GLOBAL_STATE_CODE();
    956     /* TODO Drivers without bdrv_file_open must be specified explicitly */
    957 
    958     /*
    959      * XXX(hch): we really should not let host device detection
    960      * override an explicit protocol specification, but moving this
    961      * later breaks access to device names with colons in them.
    962      * Thanks to the brain-dead persistent naming schemes on udev-
    963      * based Linux systems those actually are quite common.
    964      */
    965     drv1 = find_hdev_driver(filename);
    966     if (drv1) {
    967         return drv1;
    968     }
    969 
    970     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
    971         return &bdrv_file;
    972     }
    973 
    974     p = strchr(filename, ':');
    975     assert(p != NULL);
    976     len = p - filename;
    977     if (len > sizeof(protocol) - 1)
    978         len = sizeof(protocol) - 1;
    979     memcpy(protocol, filename, len);
    980     protocol[len] = '\0';
    981 
    982     drv1 = bdrv_do_find_protocol(protocol);
    983     if (drv1) {
    984         return drv1;
    985     }
    986 
    987     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
    988         if (block_driver_modules[i].protocol_name &&
    989             !strcmp(block_driver_modules[i].protocol_name, protocol)) {
    990             int rv = block_module_load(block_driver_modules[i].library_name, errp);
    991             if (rv > 0) {
    992                 drv1 = bdrv_do_find_protocol(protocol);
    993             } else if (rv < 0) {
    994                 return NULL;
    995             }
    996             break;
    997         }
    998     }
    999 
   1000     if (!drv1) {
   1001         error_setg(errp, "Unknown protocol '%s'", protocol);
   1002     }
   1003     return drv1;
   1004 }
   1005 
   1006 /*
   1007  * Guess image format by probing its contents.
   1008  * This is not a good idea when your image is raw (CVE-2008-2004), but
   1009  * we do it anyway for backward compatibility.
   1010  *
   1011  * @buf         contains the image's first @buf_size bytes.
   1012  * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
   1013  *              but can be smaller if the image file is smaller)
   1014  * @filename    is its filename.
   1015  *
   1016  * For all block drivers, call the bdrv_probe() method to get its
   1017  * probing score.
   1018  * Return the first block driver with the highest probing score.
   1019  */
   1020 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
   1021                             const char *filename)
   1022 {
   1023     int score_max = 0, score;
   1024     BlockDriver *drv = NULL, *d;
   1025     IO_CODE();
   1026 
   1027     QLIST_FOREACH(d, &bdrv_drivers, list) {
   1028         if (d->bdrv_probe) {
   1029             score = d->bdrv_probe(buf, buf_size, filename);
   1030             if (score > score_max) {
   1031                 score_max = score;
   1032                 drv = d;
   1033             }
   1034         }
   1035     }
   1036 
   1037     return drv;
   1038 }
   1039 
   1040 static int find_image_format(BlockBackend *file, const char *filename,
   1041                              BlockDriver **pdrv, Error **errp)
   1042 {
   1043     BlockDriver *drv;
   1044     uint8_t buf[BLOCK_PROBE_BUF_SIZE];
   1045     int ret = 0;
   1046 
   1047     GLOBAL_STATE_CODE();
   1048 
   1049     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
   1050     if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
   1051         *pdrv = &bdrv_raw;
   1052         return ret;
   1053     }
   1054 
   1055     ret = blk_pread(file, 0, sizeof(buf), buf, 0);
   1056     if (ret < 0) {
   1057         error_setg_errno(errp, -ret, "Could not read image for determining its "
   1058                          "format");
   1059         *pdrv = NULL;
   1060         return ret;
   1061     }
   1062 
   1063     drv = bdrv_probe_all(buf, sizeof(buf), filename);
   1064     if (!drv) {
   1065         error_setg(errp, "Could not determine image format: No compatible "
   1066                    "driver found");
   1067         *pdrv = NULL;
   1068         return -ENOENT;
   1069     }
   1070 
   1071     *pdrv = drv;
   1072     return 0;
   1073 }
   1074 
   1075 /**
   1076  * Set the current 'total_sectors' value
   1077  * Return 0 on success, -errno on error.
   1078  */
   1079 int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
   1080 {
   1081     BlockDriver *drv = bs->drv;
   1082     IO_CODE();
   1083 
   1084     if (!drv) {
   1085         return -ENOMEDIUM;
   1086     }
   1087 
   1088     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
   1089     if (bdrv_is_sg(bs))
   1090         return 0;
   1091 
   1092     /* query actual device if possible, otherwise just trust the hint */
   1093     if (drv->bdrv_getlength) {
   1094         int64_t length = drv->bdrv_getlength(bs);
   1095         if (length < 0) {
   1096             return length;
   1097         }
   1098         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
   1099     }
   1100 
   1101     bs->total_sectors = hint;
   1102 
   1103     if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
   1104         return -EFBIG;
   1105     }
   1106 
   1107     return 0;
   1108 }
   1109 
   1110 /**
   1111  * Combines a QDict of new block driver @options with any missing options taken
   1112  * from @old_options, so that leaving out an option defaults to its old value.
   1113  */
   1114 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
   1115                               QDict *old_options)
   1116 {
   1117     GLOBAL_STATE_CODE();
   1118     if (bs->drv && bs->drv->bdrv_join_options) {
   1119         bs->drv->bdrv_join_options(options, old_options);
   1120     } else {
   1121         qdict_join(options, old_options, false);
   1122     }
   1123 }
   1124 
   1125 static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
   1126                                                             int open_flags,
   1127                                                             Error **errp)
   1128 {
   1129     Error *local_err = NULL;
   1130     char *value = qemu_opt_get_del(opts, "detect-zeroes");
   1131     BlockdevDetectZeroesOptions detect_zeroes =
   1132         qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
   1133                         BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
   1134     GLOBAL_STATE_CODE();
   1135     g_free(value);
   1136     if (local_err) {
   1137         error_propagate(errp, local_err);
   1138         return detect_zeroes;
   1139     }
   1140 
   1141     if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
   1142         !(open_flags & BDRV_O_UNMAP))
   1143     {
   1144         error_setg(errp, "setting detect-zeroes to unmap is not allowed "
   1145                    "without setting discard operation to unmap");
   1146     }
   1147 
   1148     return detect_zeroes;
   1149 }
   1150 
   1151 /**
   1152  * Set open flags for aio engine
   1153  *
   1154  * Return 0 on success, -1 if the engine specified is invalid
   1155  */
   1156 int bdrv_parse_aio(const char *mode, int *flags)
   1157 {
   1158     if (!strcmp(mode, "threads")) {
   1159         /* do nothing, default */
   1160     } else if (!strcmp(mode, "native")) {
   1161         *flags |= BDRV_O_NATIVE_AIO;
   1162 #ifdef CONFIG_LINUX_IO_URING
   1163     } else if (!strcmp(mode, "io_uring")) {
   1164         *flags |= BDRV_O_IO_URING;
   1165 #endif
   1166     } else {
   1167         return -1;
   1168     }
   1169 
   1170     return 0;
   1171 }
   1172 
   1173 /**
   1174  * Set open flags for a given discard mode
   1175  *
   1176  * Return 0 on success, -1 if the discard mode was invalid.
   1177  */
   1178 int bdrv_parse_discard_flags(const char *mode, int *flags)
   1179 {
   1180     *flags &= ~BDRV_O_UNMAP;
   1181 
   1182     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
   1183         /* do nothing */
   1184     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
   1185         *flags |= BDRV_O_UNMAP;
   1186     } else {
   1187         return -1;
   1188     }
   1189 
   1190     return 0;
   1191 }
   1192 
   1193 /**
   1194  * Set open flags for a given cache mode
   1195  *
   1196  * Return 0 on success, -1 if the cache mode was invalid.
   1197  */
   1198 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
   1199 {
   1200     *flags &= ~BDRV_O_CACHE_MASK;
   1201 
   1202     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
   1203         *writethrough = false;
   1204         *flags |= BDRV_O_NOCACHE;
   1205     } else if (!strcmp(mode, "directsync")) {
   1206         *writethrough = true;
   1207         *flags |= BDRV_O_NOCACHE;
   1208     } else if (!strcmp(mode, "writeback")) {
   1209         *writethrough = false;
   1210     } else if (!strcmp(mode, "unsafe")) {
   1211         *writethrough = false;
   1212         *flags |= BDRV_O_NO_FLUSH;
   1213     } else if (!strcmp(mode, "writethrough")) {
   1214         *writethrough = true;
   1215     } else {
   1216         return -1;
   1217     }
   1218 
   1219     return 0;
   1220 }
   1221 
   1222 static char *bdrv_child_get_parent_desc(BdrvChild *c)
   1223 {
   1224     BlockDriverState *parent = c->opaque;
   1225     return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
   1226 }
   1227 
   1228 static void bdrv_child_cb_drained_begin(BdrvChild *child)
   1229 {
   1230     BlockDriverState *bs = child->opaque;
   1231     bdrv_do_drained_begin_quiesce(bs, NULL, false);
   1232 }
   1233 
   1234 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
   1235 {
   1236     BlockDriverState *bs = child->opaque;
   1237     return bdrv_drain_poll(bs, false, NULL, false);
   1238 }
   1239 
   1240 static void bdrv_child_cb_drained_end(BdrvChild *child,
   1241                                       int *drained_end_counter)
   1242 {
   1243     BlockDriverState *bs = child->opaque;
   1244     bdrv_drained_end_no_poll(bs, drained_end_counter);
   1245 }
   1246 
   1247 static int bdrv_child_cb_inactivate(BdrvChild *child)
   1248 {
   1249     BlockDriverState *bs = child->opaque;
   1250     GLOBAL_STATE_CODE();
   1251     assert(bs->open_flags & BDRV_O_INACTIVE);
   1252     return 0;
   1253 }
   1254 
   1255 static bool bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx,
   1256                                          GHashTable *visited, Transaction *tran,
   1257                                          Error **errp)
   1258 {
   1259     BlockDriverState *bs = child->opaque;
   1260     return bdrv_change_aio_context(bs, ctx, visited, tran, errp);
   1261 }
   1262 
   1263 /*
   1264  * Returns the options and flags that a temporary snapshot should get, based on
   1265  * the originally requested flags (the originally requested image will have
   1266  * flags like a backing file)
   1267  */
   1268 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
   1269                                        int parent_flags, QDict *parent_options)
   1270 {
   1271     GLOBAL_STATE_CODE();
   1272     *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
   1273 
   1274     /* For temporary files, unconditional cache=unsafe is fine */
   1275     qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
   1276     qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
   1277 
   1278     /* Copy the read-only and discard options from the parent */
   1279     qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
   1280     qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
   1281 
   1282     /* aio=native doesn't work for cache.direct=off, so disable it for the
   1283      * temporary snapshot */
   1284     *child_flags &= ~BDRV_O_NATIVE_AIO;
   1285 }
   1286 
   1287 static void bdrv_backing_attach(BdrvChild *c)
   1288 {
   1289     BlockDriverState *parent = c->opaque;
   1290     BlockDriverState *backing_hd = c->bs;
   1291 
   1292     GLOBAL_STATE_CODE();
   1293     assert(!parent->backing_blocker);
   1294     error_setg(&parent->backing_blocker,
   1295                "node is used as backing hd of '%s'",
   1296                bdrv_get_device_or_node_name(parent));
   1297 
   1298     bdrv_refresh_filename(backing_hd);
   1299 
   1300     parent->open_flags &= ~BDRV_O_NO_BACKING;
   1301 
   1302     bdrv_op_block_all(backing_hd, parent->backing_blocker);
   1303     /* Otherwise we won't be able to commit or stream */
   1304     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
   1305                     parent->backing_blocker);
   1306     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
   1307                     parent->backing_blocker);
   1308     /*
   1309      * We do backup in 3 ways:
   1310      * 1. drive backup
   1311      *    The target bs is new opened, and the source is top BDS
   1312      * 2. blockdev backup
   1313      *    Both the source and the target are top BDSes.
   1314      * 3. internal backup(used for block replication)
   1315      *    Both the source and the target are backing file
   1316      *
   1317      * In case 1 and 2, neither the source nor the target is the backing file.
   1318      * In case 3, we will block the top BDS, so there is only one block job
   1319      * for the top BDS and its backing chain.
   1320      */
   1321     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
   1322                     parent->backing_blocker);
   1323     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
   1324                     parent->backing_blocker);
   1325 }
   1326 
   1327 static void bdrv_backing_detach(BdrvChild *c)
   1328 {
   1329     BlockDriverState *parent = c->opaque;
   1330 
   1331     GLOBAL_STATE_CODE();
   1332     assert(parent->backing_blocker);
   1333     bdrv_op_unblock_all(c->bs, parent->backing_blocker);
   1334     error_free(parent->backing_blocker);
   1335     parent->backing_blocker = NULL;
   1336 }
   1337 
   1338 static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
   1339                                         const char *filename, Error **errp)
   1340 {
   1341     BlockDriverState *parent = c->opaque;
   1342     bool read_only = bdrv_is_read_only(parent);
   1343     int ret;
   1344     GLOBAL_STATE_CODE();
   1345 
   1346     if (read_only) {
   1347         ret = bdrv_reopen_set_read_only(parent, false, errp);
   1348         if (ret < 0) {
   1349             return ret;
   1350         }
   1351     }
   1352 
   1353     ret = bdrv_change_backing_file(parent, filename,
   1354                                    base->drv ? base->drv->format_name : "",
   1355                                    false);
   1356     if (ret < 0) {
   1357         error_setg_errno(errp, -ret, "Could not update backing file link");
   1358     }
   1359 
   1360     if (read_only) {
   1361         bdrv_reopen_set_read_only(parent, true, NULL);
   1362     }
   1363 
   1364     return ret;
   1365 }
   1366 
   1367 /*
   1368  * Returns the options and flags that a generic child of a BDS should
   1369  * get, based on the given options and flags for the parent BDS.
   1370  */
   1371 static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
   1372                                    int *child_flags, QDict *child_options,
   1373                                    int parent_flags, QDict *parent_options)
   1374 {
   1375     int flags = parent_flags;
   1376     GLOBAL_STATE_CODE();
   1377 
   1378     /*
   1379      * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
   1380      * Generally, the question to answer is: Should this child be
   1381      * format-probed by default?
   1382      */
   1383 
   1384     /*
   1385      * Pure and non-filtered data children of non-format nodes should
   1386      * be probed by default (even when the node itself has BDRV_O_PROTOCOL
   1387      * set).  This only affects a very limited set of drivers (namely
   1388      * quorum and blkverify when this comment was written).
   1389      * Force-clear BDRV_O_PROTOCOL then.
   1390      */
   1391     if (!parent_is_format &&
   1392         (role & BDRV_CHILD_DATA) &&
   1393         !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
   1394     {
   1395         flags &= ~BDRV_O_PROTOCOL;
   1396     }
   1397 
   1398     /*
   1399      * All children of format nodes (except for COW children) and all
   1400      * metadata children in general should never be format-probed.
   1401      * Force-set BDRV_O_PROTOCOL then.
   1402      */
   1403     if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
   1404         (role & BDRV_CHILD_METADATA))
   1405     {
   1406         flags |= BDRV_O_PROTOCOL;
   1407     }
   1408 
   1409     /*
   1410      * If the cache mode isn't explicitly set, inherit direct and no-flush from
   1411      * the parent.
   1412      */
   1413     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
   1414     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
   1415     qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
   1416 
   1417     if (role & BDRV_CHILD_COW) {
   1418         /* backing files are opened read-only by default */
   1419         qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
   1420         qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
   1421     } else {
   1422         /* Inherit the read-only option from the parent if it's not set */
   1423         qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
   1424         qdict_copy_default(child_options, parent_options,
   1425                            BDRV_OPT_AUTO_READ_ONLY);
   1426     }
   1427 
   1428     /*
   1429      * bdrv_co_pdiscard() respects unmap policy for the parent, so we
   1430      * can default to enable it on lower layers regardless of the
   1431      * parent option.
   1432      */
   1433     qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
   1434 
   1435     /* Clear flags that only apply to the top layer */
   1436     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
   1437 
   1438     if (role & BDRV_CHILD_METADATA) {
   1439         flags &= ~BDRV_O_NO_IO;
   1440     }
   1441     if (role & BDRV_CHILD_COW) {
   1442         flags &= ~BDRV_O_TEMPORARY;
   1443     }
   1444 
   1445     *child_flags = flags;
   1446 }
   1447 
   1448 static void bdrv_child_cb_attach(BdrvChild *child)
   1449 {
   1450     BlockDriverState *bs = child->opaque;
   1451 
   1452     assert_bdrv_graph_writable(bs);
   1453     QLIST_INSERT_HEAD(&bs->children, child, next);
   1454     if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
   1455         /*
   1456          * Here we handle filters and block/raw-format.c when it behave like
   1457          * filter. They generally have a single PRIMARY child, which is also the
   1458          * FILTERED child, and that they may have multiple more children, which
   1459          * are neither PRIMARY nor FILTERED. And never we have a COW child here.
   1460          * So bs->file will be the PRIMARY child, unless the PRIMARY child goes
   1461          * into bs->backing on exceptional cases; and bs->backing will be
   1462          * nothing else.
   1463          */
   1464         assert(!(child->role & BDRV_CHILD_COW));
   1465         if (child->role & BDRV_CHILD_PRIMARY) {
   1466             assert(child->role & BDRV_CHILD_FILTERED);
   1467             assert(!bs->backing);
   1468             assert(!bs->file);
   1469 
   1470             if (bs->drv->filtered_child_is_backing) {
   1471                 bs->backing = child;
   1472             } else {
   1473                 bs->file = child;
   1474             }
   1475         } else {
   1476             assert(!(child->role & BDRV_CHILD_FILTERED));
   1477         }
   1478     } else if (child->role & BDRV_CHILD_COW) {
   1479         assert(bs->drv->supports_backing);
   1480         assert(!(child->role & BDRV_CHILD_PRIMARY));
   1481         assert(!bs->backing);
   1482         bs->backing = child;
   1483         bdrv_backing_attach(child);
   1484     } else if (child->role & BDRV_CHILD_PRIMARY) {
   1485         assert(!bs->file);
   1486         bs->file = child;
   1487     }
   1488 
   1489     bdrv_apply_subtree_drain(child, bs);
   1490 }
   1491 
   1492 static void bdrv_child_cb_detach(BdrvChild *child)
   1493 {
   1494     BlockDriverState *bs = child->opaque;
   1495 
   1496     if (child->role & BDRV_CHILD_COW) {
   1497         bdrv_backing_detach(child);
   1498     }
   1499 
   1500     bdrv_unapply_subtree_drain(child, bs);
   1501 
   1502     assert_bdrv_graph_writable(bs);
   1503     QLIST_REMOVE(child, next);
   1504     if (child == bs->backing) {
   1505         assert(child != bs->file);
   1506         bs->backing = NULL;
   1507     } else if (child == bs->file) {
   1508         bs->file = NULL;
   1509     }
   1510 }
   1511 
   1512 static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
   1513                                          const char *filename, Error **errp)
   1514 {
   1515     if (c->role & BDRV_CHILD_COW) {
   1516         return bdrv_backing_update_filename(c, base, filename, errp);
   1517     }
   1518     return 0;
   1519 }
   1520 
   1521 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
   1522 {
   1523     BlockDriverState *bs = c->opaque;
   1524     IO_CODE();
   1525 
   1526     return bdrv_get_aio_context(bs);
   1527 }
   1528 
   1529 const BdrvChildClass child_of_bds = {
   1530     .parent_is_bds   = true,
   1531     .get_parent_desc = bdrv_child_get_parent_desc,
   1532     .inherit_options = bdrv_inherited_options,
   1533     .drained_begin   = bdrv_child_cb_drained_begin,
   1534     .drained_poll    = bdrv_child_cb_drained_poll,
   1535     .drained_end     = bdrv_child_cb_drained_end,
   1536     .attach          = bdrv_child_cb_attach,
   1537     .detach          = bdrv_child_cb_detach,
   1538     .inactivate      = bdrv_child_cb_inactivate,
   1539     .change_aio_ctx  = bdrv_child_cb_change_aio_ctx,
   1540     .update_filename = bdrv_child_cb_update_filename,
   1541     .get_parent_aio_context = child_of_bds_get_parent_aio_context,
   1542 };
   1543 
   1544 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
   1545 {
   1546     IO_CODE();
   1547     return c->klass->get_parent_aio_context(c);
   1548 }
   1549 
   1550 static int bdrv_open_flags(BlockDriverState *bs, int flags)
   1551 {
   1552     int open_flags = flags;
   1553     GLOBAL_STATE_CODE();
   1554 
   1555     /*
   1556      * Clear flags that are internal to the block layer before opening the
   1557      * image.
   1558      */
   1559     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
   1560 
   1561     return open_flags;
   1562 }
   1563 
   1564 static void update_flags_from_options(int *flags, QemuOpts *opts)
   1565 {
   1566     GLOBAL_STATE_CODE();
   1567 
   1568     *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
   1569 
   1570     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
   1571         *flags |= BDRV_O_NO_FLUSH;
   1572     }
   1573 
   1574     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
   1575         *flags |= BDRV_O_NOCACHE;
   1576     }
   1577 
   1578     if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
   1579         *flags |= BDRV_O_RDWR;
   1580     }
   1581 
   1582     if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
   1583         *flags |= BDRV_O_AUTO_RDONLY;
   1584     }
   1585 }
   1586 
   1587 static void update_options_from_flags(QDict *options, int flags)
   1588 {
   1589     GLOBAL_STATE_CODE();
   1590     if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
   1591         qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
   1592     }
   1593     if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
   1594         qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
   1595                        flags & BDRV_O_NO_FLUSH);
   1596     }
   1597     if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
   1598         qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
   1599     }
   1600     if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
   1601         qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
   1602                        flags & BDRV_O_AUTO_RDONLY);
   1603     }
   1604 }
   1605 
   1606 static void bdrv_assign_node_name(BlockDriverState *bs,
   1607                                   const char *node_name,
   1608                                   Error **errp)
   1609 {
   1610     char *gen_node_name = NULL;
   1611     GLOBAL_STATE_CODE();
   1612 
   1613     if (!node_name) {
   1614         node_name = gen_node_name = id_generate(ID_BLOCK);
   1615     } else if (!id_wellformed(node_name)) {
   1616         /*
   1617          * Check for empty string or invalid characters, but not if it is
   1618          * generated (generated names use characters not available to the user)
   1619          */
   1620         error_setg(errp, "Invalid node-name: '%s'", node_name);
   1621         return;
   1622     }
   1623 
   1624     /* takes care of avoiding namespaces collisions */
   1625     if (blk_by_name(node_name)) {
   1626         error_setg(errp, "node-name=%s is conflicting with a device id",
   1627                    node_name);
   1628         goto out;
   1629     }
   1630 
   1631     /* takes care of avoiding duplicates node names */
   1632     if (bdrv_find_node(node_name)) {
   1633         error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
   1634         goto out;
   1635     }
   1636 
   1637     /* Make sure that the node name isn't truncated */
   1638     if (strlen(node_name) >= sizeof(bs->node_name)) {
   1639         error_setg(errp, "Node name too long");
   1640         goto out;
   1641     }
   1642 
   1643     /* copy node name into the bs and insert it into the graph list */
   1644     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
   1645     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
   1646 out:
   1647     g_free(gen_node_name);
   1648 }
   1649 
   1650 static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
   1651                             const char *node_name, QDict *options,
   1652                             int open_flags, Error **errp)
   1653 {
   1654     Error *local_err = NULL;
   1655     int i, ret;
   1656     GLOBAL_STATE_CODE();
   1657 
   1658     bdrv_assign_node_name(bs, node_name, &local_err);
   1659     if (local_err) {
   1660         error_propagate(errp, local_err);
   1661         return -EINVAL;
   1662     }
   1663 
   1664     bs->drv = drv;
   1665     bs->opaque = g_malloc0(drv->instance_size);
   1666 
   1667     if (drv->bdrv_file_open) {
   1668         assert(!drv->bdrv_needs_filename || bs->filename[0]);
   1669         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
   1670     } else if (drv->bdrv_open) {
   1671         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
   1672     } else {
   1673         ret = 0;
   1674     }
   1675 
   1676     if (ret < 0) {
   1677         if (local_err) {
   1678             error_propagate(errp, local_err);
   1679         } else if (bs->filename[0]) {
   1680             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
   1681         } else {
   1682             error_setg_errno(errp, -ret, "Could not open image");
   1683         }
   1684         goto open_failed;
   1685     }
   1686 
   1687     assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
   1688     assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));
   1689 
   1690     /*
   1691      * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
   1692      * drivers that pass read/write requests through to a child the trouble of
   1693      * declaring support explicitly.
   1694      *
   1695      * Drivers must not propagate this flag accidentally when they initiate I/O
   1696      * to a bounce buffer. That case should be rare though.
   1697      */
   1698     bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
   1699     bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
   1700 
   1701     ret = refresh_total_sectors(bs, bs->total_sectors);
   1702     if (ret < 0) {
   1703         error_setg_errno(errp, -ret, "Could not refresh total sector count");
   1704         return ret;
   1705     }
   1706 
   1707     bdrv_refresh_limits(bs, NULL, &local_err);
   1708     if (local_err) {
   1709         error_propagate(errp, local_err);
   1710         return -EINVAL;
   1711     }
   1712 
   1713     assert(bdrv_opt_mem_align(bs) != 0);
   1714     assert(bdrv_min_mem_align(bs) != 0);
   1715     assert(is_power_of_2(bs->bl.request_alignment));
   1716 
   1717     for (i = 0; i < bs->quiesce_counter; i++) {
   1718         if (drv->bdrv_co_drain_begin) {
   1719             drv->bdrv_co_drain_begin(bs);
   1720         }
   1721     }
   1722 
   1723     return 0;
   1724 open_failed:
   1725     bs->drv = NULL;
   1726     if (bs->file != NULL) {
   1727         bdrv_unref_child(bs, bs->file);
   1728         assert(!bs->file);
   1729     }
   1730     g_free(bs->opaque);
   1731     bs->opaque = NULL;
   1732     return ret;
   1733 }
   1734 
   1735 /*
   1736  * Create and open a block node.
   1737  *
   1738  * @options is a QDict of options to pass to the block drivers, or NULL for an
   1739  * empty set of options. The reference to the QDict belongs to the block layer
   1740  * after the call (even on failure), so if the caller intends to reuse the
   1741  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
   1742  */
   1743 BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
   1744                                             const char *node_name,
   1745                                             QDict *options, int flags,
   1746                                             Error **errp)
   1747 {
   1748     BlockDriverState *bs;
   1749     int ret;
   1750 
   1751     GLOBAL_STATE_CODE();
   1752 
   1753     bs = bdrv_new();
   1754     bs->open_flags = flags;
   1755     bs->options = options ?: qdict_new();
   1756     bs->explicit_options = qdict_clone_shallow(bs->options);
   1757     bs->opaque = NULL;
   1758 
   1759     update_options_from_flags(bs->options, flags);
   1760 
   1761     ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
   1762     if (ret < 0) {
   1763         qobject_unref(bs->explicit_options);
   1764         bs->explicit_options = NULL;
   1765         qobject_unref(bs->options);
   1766         bs->options = NULL;
   1767         bdrv_unref(bs);
   1768         return NULL;
   1769     }
   1770 
   1771     return bs;
   1772 }
   1773 
   1774 /* Create and open a block node. */
   1775 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
   1776                                        int flags, Error **errp)
   1777 {
   1778     GLOBAL_STATE_CODE();
   1779     return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
   1780 }
   1781 
        /*
         * Generic per-node options. bdrv_open_common() creates a QemuOpts from this
         * list and absorbs the matching keys out of the options QDict before the
         * remaining entries are handed to the block driver.
         */
   1782 QemuOptsList bdrv_runtime_opts = {
   1783     .name = "bdrv_common",
   1784     .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
   1785     .desc = {
   1786         {
   1787             .name = "node-name",
   1788             .type = QEMU_OPT_STRING,
   1789             .help = "Node name of the block device node",
   1790         },
   1791         {
   1792             .name = "driver",
   1793             .type = QEMU_OPT_STRING,
   1794             .help = "Block driver to use for the node",
   1795         },
   1796         {
   1797             .name = BDRV_OPT_CACHE_DIRECT,
   1798             .type = QEMU_OPT_BOOL,
   1799             .help = "Bypass software writeback cache on the host",
   1800         },
   1801         {
   1802             .name = BDRV_OPT_CACHE_NO_FLUSH,
   1803             .type = QEMU_OPT_BOOL,
   1804             .help = "Ignore flush requests",
   1805         },
   1806         {
   1807             .name = BDRV_OPT_READ_ONLY,
   1808             .type = QEMU_OPT_BOOL,
   1809             .help = "Node is opened in read-only mode",
   1810         },
   1811         {
   1812             .name = BDRV_OPT_AUTO_READ_ONLY,
   1813             .type = QEMU_OPT_BOOL,
   1814             .help = "Node can become read-only if opening read-write fails",
   1815         },
   1816         {
                    /* Parsed by bdrv_parse_detect_zeroes() in bdrv_open_common() */
   1817             .name = "detect-zeroes",
   1818             .type = QEMU_OPT_STRING,
   1819             .help = "try to optimize zero writes (off, on, unmap)",
   1820         },
   1821         {
                    /* Parsed by bdrv_parse_discard_flags() in bdrv_open_common() */
   1822             .name = BDRV_OPT_DISCARD,
   1823             .type = QEMU_OPT_STRING,
   1824             .help = "discard operation (ignore/off, unmap/on)",
   1825         },
   1826         {
                    /* bdrv_open_common() rejects this together with BDRV_O_RDWR */
   1827             .name = BDRV_OPT_FORCE_SHARE,
   1828             .type = QEMU_OPT_BOOL,
   1829             .help = "always accept other writers (default: off)",
   1830         },
   1831         { /* end of list */ }
   1832     },
   1833 };
   1834 
        /*
         * Minimal option list named "simple-create-opts": a virtual disk size and a
         * preallocation mode (the help text documents "off" as the only allowed
         * value).
         *
         * NOTE(review): presumably used for image creation by drivers that have no
         * creation options of their own; the users are outside this excerpt, so
         * confirm before relying on that.
         */
   1835 QemuOptsList bdrv_create_opts_simple = {
   1836     .name = "simple-create-opts",
   1837     .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
   1838     .desc = {
   1839         {
   1840             .name = BLOCK_OPT_SIZE,
   1841             .type = QEMU_OPT_SIZE,
   1842             .help = "Virtual disk size"
   1843         },
   1844         {
   1845             .name = BLOCK_OPT_PREALLOC,
   1846             .type = QEMU_OPT_STRING,
   1847             .help = "Preallocation mode (allowed values: off)"
   1848         },
   1849         { /* end of list */ }
   1850     }
   1851 };
   1852 
   1853 /*
   1854  * Common part for opening disk images and files
   1855  *
   1856  * Removes all processed options from *options.
         *
         * @bs:      node being opened; bs->file must still be NULL. Its open_flags,
         *           force_share, detect_zeroes, filename and exact_filename fields
         *           are filled in here before the driver is invoked.
         * @file:    optional BlockBackend; when non-NULL the filename is taken from
         *           blk_bs(file), otherwise from the "filename" entry of @options.
         * @options: option QDict; must be non-NULL and distinct from bs->options.
         *           The entries listed in bdrv_runtime_opts are absorbed here, the
         *           rest is passed on to bdrv_open_driver().
         * @errp:    receives the error description on failure.
         *
         * Returns 0 on success. On failure returns a negative errno value: -EINVAL
         * for invalid option combinations, -ENOTSUP for a non-whitelisted driver,
         * or whatever bdrv_apply_auto_read_only() / bdrv_open_driver() returned.
   1857  */
   1858 static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
   1859                             QDict *options, Error **errp)
   1860 {
   1861     int ret, open_flags;
   1862     const char *filename;
   1863     const char *driver_name = NULL;
   1864     const char *node_name = NULL;
   1865     const char *discard;
   1866     QemuOpts *opts;
   1867     BlockDriver *drv;
   1868     Error *local_err = NULL;
   1869     bool ro;
   1870 
   1871     assert(bs->file == NULL);
   1872     assert(options != NULL && bs->options != options);
   1873     GLOBAL_STATE_CODE();
   1874 
            /* Move the generic (bdrv_runtime_opts) keys out of @options into @opts */
   1875     opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
   1876     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
   1877         ret = -EINVAL;
   1878         goto fail_opts;
   1879     }
   1880 
   1881     update_flags_from_options(&bs->open_flags, opts);
   1882 
            /* The driver is expected to be known here; an unknown name asserts */
   1883     driver_name = qemu_opt_get(opts, "driver");
   1884     drv = bdrv_find_format(driver_name);
   1885     assert(drv != NULL);
   1886 
   1887     bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);
   1888 
   1889     if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
   1890         error_setg(errp,
   1891                    BDRV_OPT_FORCE_SHARE
   1892                    "=on can only be used with read-only images");
   1893         ret = -EINVAL;
   1894         goto fail_opts;
   1895     }
   1896 
   1897     if (file != NULL) {
   1898         bdrv_refresh_filename(blk_bs(file));
   1899         filename = blk_bs(file)->filename;
   1900     } else {
   1901         /*
   1902          * Caution: while qdict_get_try_str() is fine, getting
   1903          * non-string types would require more care.  When @options
   1904          * come from -blockdev or blockdev_add, its members are typed
   1905          * according to the QAPI schema, but when they come from
   1906          * -drive, they're all QString.
   1907          */
   1908         filename = qdict_get_try_str(options, "filename");
   1909     }
   1910 
   1911     if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
   1912         error_setg(errp, "The '%s' block driver requires a file name",
   1913                    drv->format_name);
   1914         ret = -EINVAL;
   1915         goto fail_opts;
   1916     }
   1917 
   1918     trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
   1919                            drv->format_name);
   1920 
   1921     ro = bdrv_is_read_only(bs);
   1922 
            /*
             * Whitelist enforcement: a driver that is only whitelisted for
             * read-only use may still be opened if the node can fall back to
             * read-only via bdrv_apply_auto_read_only().
             *
             * NOTE(review): @ro was sampled before this call. If
             * bdrv_apply_auto_read_only() switches the node to read-only, the
             * copy-on-read check further down still sees the old value -- confirm
             * whether that is intended.
             */
   1923     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
   1924         if (!ro && bdrv_is_whitelisted(drv, true)) {
   1925             ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
   1926         } else {
   1927             ret = -ENOTSUP;
   1928         }
   1929         if (ret < 0) {
   1930             error_setg(errp,
   1931                        !ro && bdrv_is_whitelisted(drv, true)
   1932                        ? "Driver '%s' can only be used for read-only devices"
   1933                        : "Driver '%s' is not whitelisted",
   1934                        drv->format_name);
   1935             goto fail_opts;
   1936         }
   1937     }
   1938 
   1939     /* bdrv_new() and bdrv_close() make it so */
   1940     assert(qatomic_read(&bs->copy_on_read) == 0);
   1941 
   1942     if (bs->open_flags & BDRV_O_COPY_ON_READ) {
   1943         if (!ro) {
   1944             bdrv_enable_copy_on_read(bs);
   1945         } else {
   1946             error_setg(errp, "Can't use copy-on-read on read-only device");
   1947             ret = -EINVAL;
   1948             goto fail_opts;
   1949         }
   1950     }
   1951 
   1952     discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
   1953     if (discard != NULL) {
   1954         if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
   1955             error_setg(errp, "Invalid discard option");
   1956             ret = -EINVAL;
   1957             goto fail_opts;
   1958         }
   1959     }
   1960 
   1961     bs->detect_zeroes =
   1962         bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
   1963     if (local_err) {
   1964         error_propagate(errp, local_err);
   1965         ret = -EINVAL;
   1966         goto fail_opts;
   1967     }
   1968 
            /* Record the filename (possibly empty) in both filename fields */
   1969     if (filename != NULL) {
   1970         pstrcpy(bs->filename, sizeof(bs->filename), filename);
   1971     } else {
   1972         bs->filename[0] = '\0';
   1973     }
   1974     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
   1975 
   1976     /* Open the image, either directly or using a protocol */
   1977     open_flags = bdrv_open_flags(bs, bs->open_flags);
   1978     node_name = qemu_opt_get(opts, "node-name");
   1979 
            /* A driver with .bdrv_file_open must not be given a @file backend */
   1980     assert(!drv->bdrv_file_open || file == NULL);
   1981     ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
   1982     if (ret < 0) {
   1983         goto fail_opts;
   1984     }
   1985 
   1986     qemu_opts_del(opts);
   1987     return 0;
   1988 
        /* Error exit: @opts is the only resource this function itself releases */
   1989 fail_opts:
   1990     qemu_opts_del(opts);
   1991     return ret;
   1992 }
   1993 
   1994 static QDict *parse_json_filename(const char *filename, Error **errp)
   1995 {
   1996     QObject *options_obj;
   1997     QDict *options;
   1998     int ret;
   1999     GLOBAL_STATE_CODE();
   2000 
   2001     ret = strstart(filename, "json:", &filename);
   2002     assert(ret);
   2003 
   2004     options_obj = qobject_from_json(filename, errp);
   2005     if (!options_obj) {
   2006         error_prepend(errp, "Could not parse the JSON options: ");
   2007         return NULL;
   2008     }
   2009 
   2010     options = qobject_to(QDict, options_obj);
   2011     if (!options) {
   2012         qobject_unref(options_obj);
   2013         error_setg(errp, "Invalid JSON object given");
   2014         return NULL;
   2015     }
   2016 
   2017     qdict_flatten(options);
   2018 
   2019     return options;
   2020 }
   2021 
   2022 static void parse_json_protocol(QDict *options, const char **pfilename,
   2023                                 Error **errp)
   2024 {
   2025     QDict *json_options;
   2026     Error *local_err = NULL;
   2027     GLOBAL_STATE_CODE();
   2028 
   2029     /* Parse json: pseudo-protocol */
   2030     if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
   2031         return;
   2032     }
   2033 
   2034     json_options = parse_json_filename(*pfilename, &local_err);
   2035     if (local_err) {
   2036         error_propagate(errp, local_err);
   2037         return;
   2038     }
   2039 
   2040     /* Options given in the filename have lower priority than options
   2041      * specified directly */
   2042     qdict_join(options, json_options, false);
   2043     qobject_unref(json_options);
   2044     *pfilename = NULL;
   2045 }
   2046 
   2047 /*
   2048  * Fills in default options for opening images and converts the legacy
   2049  * filename/flags pair to option QDict entries.
   2050  * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
   2051  * block driver has been specified explicitly.
         *
         * @options:  pointer to the option QDict that is updated in place.
         * @filename: legacy filename argument, may be NULL.
         * @flags:    open flags; BDRV_O_PROTOCOL is updated as described above.
         * @errp:     receives the error description on failure.
         *
         * Returns 0 on success, -ENOENT if the "driver" option names an unknown
         * driver, and -EINVAL for every other failure in this function.
   2052  */
   2053 static int bdrv_fill_options(QDict **options, const char *filename,
   2054                              int *flags, Error **errp)
   2055 {
   2056     const char *drvname;
   2057     bool protocol = *flags & BDRV_O_PROTOCOL;
   2058     bool parse_filename = false;
   2059     BlockDriver *drv = NULL;
   2060     Error *local_err = NULL;
   2061 
   2062     GLOBAL_STATE_CODE();
   2063 
   2064     /*
   2065      * Caution: while qdict_get_try_str() is fine, getting non-string
   2066      * types would require more care.  When @options come from
   2067      * -blockdev or blockdev_add, its members are typed according to
   2068      * the QAPI schema, but when they come from -drive, they're all
   2069      * QString.
   2070      */
   2071     drvname = qdict_get_try_str(*options, "driver");
   2072     if (drvname) {
   2073         drv = bdrv_find_format(drvname);
   2074         if (!drv) {
   2075             error_setg(errp, "Unknown driver '%s'", drvname);
   2076             return -ENOENT;
   2077         }
   2078         /* If the user has explicitly specified the driver, this choice should
   2079          * override the BDRV_O_PROTOCOL flag */
   2080         protocol = drv->bdrv_file_open;
   2081     }
   2082 
   2083     if (protocol) {
   2084         *flags |= BDRV_O_PROTOCOL;
   2085     } else {
   2086         *flags &= ~BDRV_O_PROTOCOL;
   2087     }
   2088 
   2089     /* Translate cache options from flags into options */
   2090     update_options_from_flags(*options, *flags);
   2091 
   2092     /* Fetch the file name from the options QDict if necessary */
            /*
             * parse_filename is only set when the name came from the legacy
             * @filename argument, not when "filename" was already an option.
             */
   2093     if (protocol && filename) {
   2094         if (!qdict_haskey(*options, "filename")) {
   2095             qdict_put_str(*options, "filename", filename);
   2096             parse_filename = true;
   2097         } else {
   2098             error_setg(errp, "Can't specify 'file' and 'filename' options at "
   2099                              "the same time");
   2100             return -EINVAL;
   2101         }
   2102     }
   2103 
   2104     /* Find the right block driver */
   2105     /* See cautionary note on accessing @options above */
   2106     filename = qdict_get_try_str(*options, "filename");
   2107 
            /* No explicit driver: look up the protocol driver from the filename */
   2108     if (!drvname && protocol) {
   2109         if (filename) {
   2110             drv = bdrv_find_protocol(filename, parse_filename, errp);
   2111             if (!drv) {
   2112                 return -EINVAL;
   2113             }
   2114 
   2115             drvname = drv->format_name;
   2116             qdict_put_str(*options, "driver", drvname);
   2117         } else {
   2118             error_setg(errp, "Must specify either driver or file");
   2119             return -EINVAL;
   2120         }
   2121     }
   2122 
            /* A protocol open has a driver by now: given explicitly or found above */
   2123     assert(drv || !protocol);
   2124 
   2125     /* Driver-specific filename parsing */
   2126     if (drv && drv->bdrv_parse_filename && parse_filename) {
   2127         drv->bdrv_parse_filename(filename, *options, &local_err);
   2128         if (local_err) {
   2129             error_propagate(errp, local_err);
   2130             return -EINVAL;
   2131         }
   2132 
                /*
                 * After driver-specific parsing the raw "filename" entry is
                 * dropped, unless the driver sets bdrv_needs_filename.
                 */
   2133         if (!drv->bdrv_needs_filename) {
   2134             qdict_del(*options, "filename");
   2135         }
   2136     }
   2137 
   2138     return 0;
   2139 }
   2140 
        /*
         * One element of a BlockReopenQueue, linked through @entry.
         *
         * state.bs and state.flags are what bdrv_reopen_get_flags() consults to
         * find the flags a node will have after the queued reopen.
         *
         * NOTE(review): @prepared and @perms_checked are set and read by reopen
         * code outside this excerpt -- confirm their exact meaning there.
         */
   2141 typedef struct BlockReopenQueueEntry {
   2142      bool prepared;
   2143      bool perms_checked;
   2144      BDRVReopenState state;
   2145      QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
   2146 } BlockReopenQueueEntry;
   2147 
   2148 /*
   2149  * Return the flags that @bs will have after the reopens in @q have
   2150  * successfully completed. If @q is NULL (or @bs is not contained in @q),
   2151  * return the current flags.
   2152  */
   2153 static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
   2154 {
   2155     BlockReopenQueueEntry *entry;
   2156 
   2157     if (q != NULL) {
   2158         QTAILQ_FOREACH(entry, q, entry) {
   2159             if (entry->state.bs == bs) {
   2160                 return entry->state.flags;
   2161             }
   2162         }
   2163     }
   2164 
   2165     return bs->open_flags;
   2166 }
   2167 
   2168 /* Returns whether the image file can be written to after the reopen queue @q
   2169  * has been successfully applied, or right now if @q is NULL. */
   2170 static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
   2171                                           BlockReopenQueue *q)
   2172 {
   2173     int flags = bdrv_reopen_get_flags(q, bs);
   2174 
   2175     return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
   2176 }
   2177 
   2178 /*
   2179  * Return whether the BDS can be written to.  This is not necessarily
   2180  * the same as !bdrv_is_read_only(bs), as inactivated images may not
   2181  * be written to but do not count as read-only images.
   2182  */
   2183 bool bdrv_is_writable(BlockDriverState *bs)
   2184 {
   2185     IO_CODE();
   2186     return bdrv_is_writable_after_reopen(bs, NULL);
   2187 }
   2188 
   2189 static char *bdrv_child_user_desc(BdrvChild *c)
   2190 {
   2191     GLOBAL_STATE_CODE();
   2192     return c->klass->get_parent_desc(c);
   2193 }
   2194 
   2195 /*
   2196  * Check that @a allows everything that @b needs. @a and @b must reference same
   2197  * child node.
   2198  */
   2199 static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
   2200 {
   2201     const char *child_bs_name;
   2202     g_autofree char *a_user = NULL;
   2203     g_autofree char *b_user = NULL;
   2204     g_autofree char *perms = NULL;
   2205 
   2206     assert(a->bs);
   2207     assert(a->bs == b->bs);
   2208     GLOBAL_STATE_CODE();
   2209 
   2210     if ((b->perm & a->shared_perm) == b->perm) {
   2211         return true;
   2212     }
   2213 
   2214     child_bs_name = bdrv_get_node_name(b->bs);
   2215     a_user = bdrv_child_user_desc(a);
   2216     b_user = bdrv_child_user_desc(b);
   2217     perms = bdrv_perm_names(b->perm & ~a->shared_perm);
   2218 
   2219     error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
   2220                "both required by %s (uses node '%s' as '%s' child) and "
   2221                "unshared by %s (uses node '%s' as '%s' child).",
   2222                child_bs_name, perms,
   2223                b_user, child_bs_name, b->name,
   2224                a_user, child_bs_name, a->name);
   2225 
   2226     return false;
   2227 }
   2228 
   2229 static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
   2230 {
   2231     BdrvChild *a, *b;
   2232     GLOBAL_STATE_CODE();
   2233 
   2234     /*
   2235      * During the loop we'll look at each pair twice. That's correct because
   2236      * bdrv_a_allow_b() is asymmetric and we should check each pair in both
   2237      * directions.
   2238      */
   2239     QLIST_FOREACH(a, &bs->parents, next_parent) {
   2240         QLIST_FOREACH(b, &bs->parents, next_parent) {
   2241             if (a == b) {
   2242                 continue;
   2243             }
   2244 
   2245             if (!bdrv_a_allow_b(a, b, errp)) {
   2246                 return true;
   2247             }
   2248         }
   2249     }
   2250 
   2251     return false;
   2252 }
   2253 
   2254 static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
   2255                             BdrvChild *c, BdrvChildRole role,
   2256                             BlockReopenQueue *reopen_queue,
   2257                             uint64_t parent_perm, uint64_t parent_shared,
   2258                             uint64_t *nperm, uint64_t *nshared)
   2259 {
   2260     assert(bs->drv && bs->drv->bdrv_child_perm);
   2261     GLOBAL_STATE_CODE();
   2262     bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
   2263                              parent_perm, parent_shared,
   2264                              nperm, nshared);
   2265     /* TODO Take force_share from reopen_queue */
   2266     if (child_bs && child_bs->force_share) {
   2267         *nshared = BLK_PERM_ALL;
   2268     }
   2269 }
   2270 
   2271 /*
   2272  * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
   2273  * nodes that are already in the @list, of course) so that final list is
   2274  * topologically sorted. Return the result (GSList @list object is updated, so
   2275  * don't use old reference after function call).
   2276  *
   2277  * On function start @list must be already topologically sorted and for any node
   2278  * in the @list the whole subtree of the node must be in the @list as well. The
   2279  * simplest way to satisfy this criteria: use only result of
   2280  * bdrv_topological_dfs() or NULL as @list parameter.
   2281  */
   2282 static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
   2283                                     BlockDriverState *bs)
   2284 {
   2285     BdrvChild *child;
   2286     g_autoptr(GHashTable) local_found = NULL;
   2287 
   2288     GLOBAL_STATE_CODE();
   2289 
   2290     if (!found) {
   2291         assert(!list);
   2292         found = local_found = g_hash_table_new(NULL, NULL);
   2293     }
   2294 
   2295     if (g_hash_table_contains(found, bs)) {
   2296         return list;
   2297     }
   2298     g_hash_table_add(found, bs);
   2299 
   2300     QLIST_FOREACH(child, &bs->children, next) {
   2301         list = bdrv_topological_dfs(list, found, child->bs);
   2302     }
   2303 
   2304     return g_slist_prepend(list, bs);
   2305 }
   2306 
        /*
         * Undo record created by bdrv_child_set_perm(): the child whose permissions
         * were changed and the perm/shared_perm values it had before the change.
         */
   2307 typedef struct BdrvChildSetPermState {
   2308     BdrvChild *child;
   2309     uint64_t old_perm;
   2310     uint64_t old_shared_perm;
   2311 } BdrvChildSetPermState;
   2312 
   2313 static void bdrv_child_set_perm_abort(void *opaque)
   2314 {
   2315     BdrvChildSetPermState *s = opaque;
   2316 
   2317     GLOBAL_STATE_CODE();
   2318 
   2319     s->child->perm = s->old_perm;
   2320     s->child->shared_perm = s->old_shared_perm;
   2321 }
   2322 
        /*
         * Transaction action registered by bdrv_child_set_perm(): .abort restores
         * the saved permissions; .clean is g_free, presumably applied to the
         * BdrvChildSetPermState once the transaction is done.
         *
         * NOTE(review): "pem" in the name looks like a typo for "perm"; a rename
         * would have to cover every user of this symbol in the file.
         */
   2323 static TransactionActionDrv bdrv_child_set_pem_drv = {
   2324     .abort = bdrv_child_set_perm_abort,
   2325     .clean = g_free,
   2326 };
   2327 
   2328 static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
   2329                                 uint64_t shared, Transaction *tran)
   2330 {
   2331     BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
   2332     GLOBAL_STATE_CODE();
   2333 
   2334     *s = (BdrvChildSetPermState) {
   2335         .child = c,
   2336         .old_perm = c->perm,
   2337         .old_shared_perm = c->shared_perm,
   2338     };
   2339 
   2340     c->perm = perm;
   2341     c->shared_perm = shared;
   2342 
   2343     tran_add(tran, &bdrv_child_set_pem_drv, s);
   2344 }
   2345 
   2346 static void bdrv_drv_set_perm_commit(void *opaque)
   2347 {
   2348     BlockDriverState *bs = opaque;
   2349     uint64_t cumulative_perms, cumulative_shared_perms;
   2350     GLOBAL_STATE_CODE();
   2351 
   2352     if (bs->drv->bdrv_set_perm) {
   2353         bdrv_get_cumulative_perm(bs, &cumulative_perms,
   2354                                  &cumulative_shared_perms);
   2355         bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
   2356     }
   2357 }
   2358 
   2359 static void bdrv_drv_set_perm_abort(void *opaque)
   2360 {
   2361     BlockDriverState *bs = opaque;
   2362     GLOBAL_STATE_CODE();
   2363 
   2364     if (bs->drv->bdrv_abort_perm_update) {
   2365         bs->drv->bdrv_abort_perm_update(bs);
   2366     }
   2367 }
   2368 
        /*
         * Transaction action registered by bdrv_drv_set_perm(). There is no .clean
         * callback: the opaque pointer handed to tran_add() is the BlockDriverState
         * itself, not an allocation owned by the action.
         */
   2369 TransactionActionDrv bdrv_drv_set_perm_drv = {
   2370     .abort = bdrv_drv_set_perm_abort,
   2371     .commit = bdrv_drv_set_perm_commit,
   2372 };
   2373 
   2374 static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
   2375                              uint64_t shared_perm, Transaction *tran,
   2376                              Error **errp)
   2377 {
   2378     GLOBAL_STATE_CODE();
   2379     if (!bs->drv) {
   2380         return 0;
   2381     }
   2382 
   2383     if (bs->drv->bdrv_check_perm) {
   2384         int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
   2385         if (ret < 0) {
   2386             return ret;
   2387         }
   2388     }
   2389 
   2390     if (tran) {
   2391         tran_add(tran, &bdrv_drv_set_perm_drv, bs);
   2392     }
   2393 
   2394     return 0;
   2395 }
   2396 
        /*
         * Undo record created by bdrv_replace_child_tran(): the child link that was
         * re-pointed and the node it referenced before the replacement.
         */
   2397 typedef struct BdrvReplaceChildState {
   2398     BdrvChild *child;
   2399     BlockDriverState *old_bs;
   2400 } BdrvReplaceChildState;
   2401 
   2402 static void bdrv_replace_child_commit(void *opaque)
   2403 {
   2404     BdrvReplaceChildState *s = opaque;
   2405     GLOBAL_STATE_CODE();
   2406 
   2407     bdrv_unref(s->old_bs);
   2408 }
   2409 
   2410 static void bdrv_replace_child_abort(void *opaque)
   2411 {
   2412     BdrvReplaceChildState *s = opaque;
   2413     BlockDriverState *new_bs = s->child->bs;
   2414 
   2415     GLOBAL_STATE_CODE();
   2416     /* old_bs reference is transparently moved from @s to @s->child */
   2417     bdrv_replace_child_noperm(s->child, s->old_bs);
   2418     bdrv_unref(new_bs);
   2419 }
   2420 
        /*
         * Transaction action registered by bdrv_replace_child_tran(): .commit drops
         * the old node's reference, .abort reinstates the old node; .clean is
         * g_free, presumably applied to the BdrvReplaceChildState afterwards.
         */
   2421 static TransactionActionDrv bdrv_replace_child_drv = {
   2422     .commit = bdrv_replace_child_commit,
   2423     .abort = bdrv_replace_child_abort,
   2424     .clean = g_free,
   2425 };
   2426 
   2427 /*
   2428  * bdrv_replace_child_tran
   2429  *
   2430  * Note: real unref of old_bs is done only on commit.
   2431  *
   2432  * The function doesn't update permissions, caller is responsible for this.
   2433  */
   2434 static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
   2435                                     Transaction *tran)
   2436 {
   2437     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
   2438     *s = (BdrvReplaceChildState) {
   2439         .child = child,
   2440         .old_bs = child->bs,
   2441     };
   2442     tran_add(tran, &bdrv_replace_child_drv, s);
   2443 
   2444     if (new_bs) {
   2445         bdrv_ref(new_bs);
   2446     }
   2447     bdrv_replace_child_noperm(child, new_bs);
   2448     /* old_bs reference is transparently moved from @child to @s */
   2449 }
   2450 
   2451 /*
   2452  * Refresh permissions in @bs subtree. The function is intended to be called
   2453  * after some graph modification that was done without permission update.
   2454  */
   2455 static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
   2456                                   Transaction *tran, Error **errp)
   2457 {
   2458     BlockDriver *drv = bs->drv;
   2459     BdrvChild *c;
   2460     int ret;
   2461     uint64_t cumulative_perms, cumulative_shared_perms;
   2462     GLOBAL_STATE_CODE();
   2463 
   2464     bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
   2465 
   2466     /* Write permissions never work with read-only images */
   2467     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
   2468         !bdrv_is_writable_after_reopen(bs, q))
   2469     {
   2470         if (!bdrv_is_writable_after_reopen(bs, NULL)) {
   2471             error_setg(errp, "Block node is read-only");
   2472         } else {
   2473             error_setg(errp, "Read-only block node '%s' cannot support "
   2474                        "read-write users", bdrv_get_node_name(bs));
   2475         }
   2476 
   2477         return -EPERM;
   2478     }
   2479 
   2480     /*
   2481      * Unaligned requests will automatically be aligned to bl.request_alignment
   2482      * and without RESIZE we can't extend requests to write to space beyond the
   2483      * end of the image, so it's required that the image size is aligned.
   2484      */
   2485     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
   2486         !(cumulative_perms & BLK_PERM_RESIZE))
   2487     {
   2488         if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
   2489             error_setg(errp, "Cannot get 'write' permission without 'resize': "
   2490                              "Image size is not a multiple of request "
   2491                              "alignment");
   2492             return -EPERM;
   2493         }
   2494     }
   2495 
   2496     /* Check this node */
   2497     if (!drv) {
   2498         return 0;
   2499     }
   2500 
   2501     ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
   2502                             errp);
   2503     if (ret < 0) {
   2504         return ret;
   2505     }
   2506 
   2507     /* Drivers that never have children can omit .bdrv_child_perm() */
   2508     if (!drv->bdrv_child_perm) {
   2509         assert(QLIST_EMPTY(&bs->children));
   2510         return 0;
   2511     }
   2512 
   2513     /* Check all children */
   2514     QLIST_FOREACH(c, &bs->children, next) {
   2515         uint64_t cur_perm, cur_shared;
   2516 
   2517         bdrv_child_perm(bs, c->bs, c, c->role, q,
   2518                         cumulative_perms, cumulative_shared_perms,
   2519                         &cur_perm, &cur_shared);
   2520         bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
   2521     }
   2522 
   2523     return 0;
   2524 }
   2525 
   2526 static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
   2527                                    Transaction *tran, Error **errp)
   2528 {
   2529     int ret;
   2530     BlockDriverState *bs;
   2531     GLOBAL_STATE_CODE();
   2532 
   2533     for ( ; list; list = list->next) {
   2534         bs = list->data;
   2535 
   2536         if (bdrv_parent_perms_conflict(bs, errp)) {
   2537             return -EINVAL;
   2538         }
   2539 
   2540         ret = bdrv_node_refresh_perm(bs, q, tran, errp);
   2541         if (ret < 0) {
   2542             return ret;
   2543         }
   2544     }
   2545 
   2546     return 0;
   2547 }
   2548 
   2549 void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
   2550                               uint64_t *shared_perm)
   2551 {
   2552     BdrvChild *c;
   2553     uint64_t cumulative_perms = 0;
   2554     uint64_t cumulative_shared_perms = BLK_PERM_ALL;
   2555 
   2556     GLOBAL_STATE_CODE();
   2557 
   2558     QLIST_FOREACH(c, &bs->parents, next_parent) {
   2559         cumulative_perms |= c->perm;
   2560         cumulative_shared_perms &= c->shared_perm;
   2561     }
   2562 
   2563     *perm = cumulative_perms;
   2564     *shared_perm = cumulative_shared_perms;
   2565 }
   2566 
   2567 char *bdrv_perm_names(uint64_t perm)
   2568 {
   2569     struct perm_name {
   2570         uint64_t perm;
   2571         const char *name;
   2572     } permissions[] = {
   2573         { BLK_PERM_CONSISTENT_READ, "consistent read" },
   2574         { BLK_PERM_WRITE,           "write" },
   2575         { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
   2576         { BLK_PERM_RESIZE,          "resize" },
   2577         { 0, NULL }
   2578     };
   2579 
   2580     GString *result = g_string_sized_new(30);
   2581     struct perm_name *p;
   2582 
   2583     for (p = permissions; p->name; p++) {
   2584         if (perm & p->perm) {
   2585             if (result->len > 0) {
   2586                 g_string_append(result, ", ");
   2587             }
   2588             g_string_append(result, p->name);
   2589         }
   2590     }
   2591 
   2592     return g_string_free(result, FALSE);
   2593 }
   2594 
   2595 
   2596 static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
   2597 {
   2598     int ret;
   2599     Transaction *tran = tran_new();
   2600     g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
   2601     GLOBAL_STATE_CODE();
   2602 
   2603     ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
   2604     tran_finalize(tran, ret);
   2605 
   2606     return ret;
   2607 }
   2608 
/*
 * Try to change the permissions of @c to @perm / @shared.
 *
 * The new values are applied to @c through a local transaction, the
 * permissions starting at c->bs are refreshed, and the transaction is then
 * finalized with the refresh result.
 *
 * Returns 0 on success.  If the refresh fails, the error is reported
 * (negative return value, @errp set) only when the request asks for more
 * than @c holds after the transaction was finalized; otherwise the error is
 * discarded and 0 is returned.
 */
int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
                            Error **errp)
{
    Error *local_err = NULL;
    Transaction *tran = tran_new();
    int ret;

    GLOBAL_STATE_CODE();

    bdrv_child_set_perm(c, perm, shared, tran);

    ret = bdrv_refresh_perms(c->bs, &local_err);

    tran_finalize(tran, ret);

    if (ret < 0) {
        /*
         * NOTE(review): c->perm / c->shared_perm are read after
         * tran_finalize(); presumably the failed transaction has restored
         * the previous values by now, so this compares the request against
         * the old permissions -- confirm against bdrv_child_set_perm().
         */
        if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
            /* tighten permissions */
            error_propagate(errp, local_err);
        } else {
            /*
             * Our caller may intend to only loosen restrictions and
             * does not expect this function to fail.  Errors are not
             * fatal in such a case, so we can just hide them from our
             * caller.
             */
            error_free(local_err);
            ret = 0;
        }
    }

    return ret;
}
   2642 
   2643 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
   2644 {
   2645     uint64_t parent_perms, parent_shared;
   2646     uint64_t perms, shared;
   2647 
   2648     GLOBAL_STATE_CODE();
   2649 
   2650     bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
   2651     bdrv_child_perm(bs, c->bs, c, c->role, NULL,
   2652                     parent_perms, parent_shared, &perms, &shared);
   2653 
   2654     return bdrv_child_try_set_perm(c, perms, shared, errp);
   2655 }
   2656 
   2657 /*
   2658  * Default implementation for .bdrv_child_perm() for block filters:
   2659  * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
   2660  * filtered child.
   2661  */
   2662 static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
   2663                                       BdrvChildRole role,
   2664                                       BlockReopenQueue *reopen_queue,
   2665                                       uint64_t perm, uint64_t shared,
   2666                                       uint64_t *nperm, uint64_t *nshared)
   2667 {
   2668     GLOBAL_STATE_CODE();
   2669     *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
   2670     *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
   2671 }
   2672 
   2673 static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
   2674                                        BdrvChildRole role,
   2675                                        BlockReopenQueue *reopen_queue,
   2676                                        uint64_t perm, uint64_t shared,
   2677                                        uint64_t *nperm, uint64_t *nshared)
   2678 {
   2679     assert(role & BDRV_CHILD_COW);
   2680     GLOBAL_STATE_CODE();
   2681 
   2682     /*
   2683      * We want consistent read from backing files if the parent needs it.
   2684      * No other operations are performed on backing files.
   2685      */
   2686     perm &= BLK_PERM_CONSISTENT_READ;
   2687 
   2688     /*
   2689      * If the parent can deal with changing data, we're okay with a
   2690      * writable and resizable backing file.
   2691      * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
   2692      */
   2693     if (shared & BLK_PERM_WRITE) {
   2694         shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2695     } else {
   2696         shared = 0;
   2697     }
   2698 
   2699     shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
   2700 
   2701     if (bs->open_flags & BDRV_O_INACTIVE) {
   2702         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2703     }
   2704 
   2705     *nperm = perm;
   2706     *nshared = shared;
   2707 }
   2708 
   2709 static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
   2710                                            BdrvChildRole role,
   2711                                            BlockReopenQueue *reopen_queue,
   2712                                            uint64_t perm, uint64_t shared,
   2713                                            uint64_t *nperm, uint64_t *nshared)
   2714 {
   2715     int flags;
   2716 
   2717     GLOBAL_STATE_CODE();
   2718     assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
   2719 
   2720     flags = bdrv_reopen_get_flags(reopen_queue, bs);
   2721 
   2722     /*
   2723      * Apart from the modifications below, the same permissions are
   2724      * forwarded and left alone as for filters
   2725      */
   2726     bdrv_filter_default_perms(bs, c, role, reopen_queue,
   2727                               perm, shared, &perm, &shared);
   2728 
   2729     if (role & BDRV_CHILD_METADATA) {
   2730         /* Format drivers may touch metadata even if the guest doesn't write */
   2731         if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
   2732             perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2733         }
   2734 
   2735         /*
   2736          * bs->file always needs to be consistent because of the
   2737          * metadata. We can never allow other users to resize or write
   2738          * to it.
   2739          */
   2740         if (!(flags & BDRV_O_NO_IO)) {
   2741             perm |= BLK_PERM_CONSISTENT_READ;
   2742         }
   2743         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
   2744     }
   2745 
   2746     if (role & BDRV_CHILD_DATA) {
   2747         /*
   2748          * Technically, everything in this block is a subset of the
   2749          * BDRV_CHILD_METADATA path taken above, and so this could
   2750          * be an "else if" branch.  However, that is not obvious, and
   2751          * this function is not performance critical, therefore we let
   2752          * this be an independent "if".
   2753          */
   2754 
   2755         /*
   2756          * We cannot allow other users to resize the file because the
   2757          * format driver might have some assumptions about the size
   2758          * (e.g. because it is stored in metadata, or because the file
   2759          * is split into fixed-size data files).
   2760          */
   2761         shared &= ~BLK_PERM_RESIZE;
   2762 
   2763         /*
   2764          * WRITE_UNCHANGED often cannot be performed as such on the
   2765          * data file.  For example, the qcow2 driver may still need to
   2766          * write copied clusters on copy-on-read.
   2767          */
   2768         if (perm & BLK_PERM_WRITE_UNCHANGED) {
   2769             perm |= BLK_PERM_WRITE;
   2770         }
   2771 
   2772         /*
   2773          * If the data file is written to, the format driver may
   2774          * expect to be able to resize it by writing beyond the EOF.
   2775          */
   2776         if (perm & BLK_PERM_WRITE) {
   2777             perm |= BLK_PERM_RESIZE;
   2778         }
   2779     }
   2780 
   2781     if (bs->open_flags & BDRV_O_INACTIVE) {
   2782         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2783     }
   2784 
   2785     *nperm = perm;
   2786     *nshared = shared;
   2787 }
   2788 
   2789 void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
   2790                         BdrvChildRole role, BlockReopenQueue *reopen_queue,
   2791                         uint64_t perm, uint64_t shared,
   2792                         uint64_t *nperm, uint64_t *nshared)
   2793 {
   2794     GLOBAL_STATE_CODE();
   2795     if (role & BDRV_CHILD_FILTERED) {
   2796         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
   2797                          BDRV_CHILD_COW)));
   2798         bdrv_filter_default_perms(bs, c, role, reopen_queue,
   2799                                   perm, shared, nperm, nshared);
   2800     } else if (role & BDRV_CHILD_COW) {
   2801         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
   2802         bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
   2803                                    perm, shared, nperm, nshared);
   2804     } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
   2805         bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
   2806                                        perm, shared, nperm, nshared);
   2807     } else {
   2808         g_assert_not_reached();
   2809     }
   2810 }
   2811 
   2812 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
   2813 {
   2814     static const uint64_t permissions[] = {
   2815         [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
   2816         [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
   2817         [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
   2818         [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
   2819     };
   2820 
   2821     QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
   2822     QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
   2823 
   2824     assert(qapi_perm < BLOCK_PERMISSION__MAX);
   2825 
   2826     return permissions[qapi_perm];
   2827 }
   2828 
/*
 * Make @child point to @new_bs instead of its current node.  Permissions
 * are not updated here.
 *
 * Either the current node or @new_bs may be NULL, which turns this into a
 * plain attach or detach.  @child must not be frozen, @new_bs must differ
 * from the current node, and if both nodes are non-NULL they must be in the
 * same AioContext.
 *
 * Apart from updating child->bs and the parents lists of both nodes, this
 * invokes the parent's drained_begin/drained_end callbacks as often as
 * needed to make up for the difference between the new node's
 * quiesce_counter and child->parent_quiesce_counter.
 */
static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
{
    BlockDriverState *old_bs = child->bs;
    int new_bs_quiesce_counter;
    int drain_saldo;

    assert(!child->frozen);
    assert(old_bs != new_bs);
    GLOBAL_STATE_CODE();

    if (old_bs && new_bs) {
        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
    }

    /*
     * drain_saldo > 0: the parent has to enter that many more drained
     * sections; drain_saldo < 0: it has to leave that many.
     */
    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
    drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;

    /*
     * If the new child node is drained but the old one was not, flush
     * all outstanding requests to the old child node.
     */
    while (drain_saldo > 0 && child->klass->drained_begin) {
        bdrv_parent_drained_begin_single(child, true);
        drain_saldo--;
    }

    if (old_bs) {
        /* Detach first so that the recursive drain sections coming from @child
         * are already gone and we only end the drain sections that came from
         * elsewhere. */
        if (child->klass->detach) {
            child->klass->detach(child);
        }
        assert_bdrv_graph_writable(old_bs);
        QLIST_REMOVE(child, next_parent);
    }

    child->bs = new_bs;

    if (new_bs) {
        assert_bdrv_graph_writable(new_bs);
        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);

        /*
         * Detaching the old node may have led to the new node's
         * quiesce_counter having been decreased.  Not a problem, we
         * just need to recognize this here and then invoke
         * drained_end appropriately more often.
         */
        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
        drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;

        /* Attach only after starting new drained sections, so that recursive
         * drain sections coming from @child don't get an extra .drained_begin
         * callback. */
        if (child->klass->attach) {
            child->klass->attach(child);
        }
    }

    /*
     * If the old child node was drained but the new one is not, allow
     * requests to come in only after the new node has been attached.
     */
    while (drain_saldo < 0 && child->klass->drained_end) {
        bdrv_parent_drained_end_single(child);
        drain_saldo++;
    }
}
   2899 
   2900 /**
   2901  * Free the given @child.
   2902  *
   2903  * The child must be empty (i.e. `child->bs == NULL`) and it must be
   2904  * unused (i.e. not in a children list).
   2905  */
   2906 static void bdrv_child_free(BdrvChild *child)
   2907 {
   2908     assert(!child->bs);
   2909     GLOBAL_STATE_CODE();
   2910     assert(!child->next.le_prev); /* not in children list */
   2911 
   2912     g_free(child->name);
   2913     g_free(child);
   2914 }
   2915 
/*
 * Transaction state recorded by bdrv_attach_child_common() and consumed by
 * bdrv_attach_child_common_abort() to undo the attachment.
 */
typedef struct BdrvAttachChildCommonState {
    /* The child that was created and attached */
    BdrvChild *child;
    /* AioContext of the parent before the attach attempted to change it */
    AioContext *old_parent_ctx;
    /* AioContext of the child node before the attach attempted to change it */
    AioContext *old_child_ctx;
} BdrvAttachChildCommonState;
   2921 
/*
 * Transaction .abort handler for bdrv_attach_child_common().
 *
 * Detaches the child from its node again, moves the node and the parent
 * back to the AioContexts recorded in the state if they differ now, and
 * finally drops a reference to the node and frees the child.
 */
static void bdrv_attach_child_common_abort(void *opaque)
{
    BdrvAttachChildCommonState *s = opaque;
    /* Remember the node: s->child->bs is cleared by the detach below */
    BlockDriverState *bs = s->child->bs;

    GLOBAL_STATE_CODE();
    bdrv_replace_child_noperm(s->child, NULL);

    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
        bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort);
    }

    if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) {
        Transaction *tran;
        GHashTable *visited;
        bool ret;

        tran = tran_new();

        /* No need to visit `child`, because it has been detached already */
        visited = g_hash_table_new(NULL, NULL);
        ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx,
                                              visited, tran, &error_abort);
        g_hash_table_destroy(visited);

        /* transaction is supposed to always succeed */
        assert(ret == true);
        tran_commit(tran);
    }

    bdrv_unref(bs);
    bdrv_child_free(s->child);
}
   2955 
/*
 * Transaction action registered by bdrv_attach_child_common(): the
 * attachment is undone on abort, and the state object is released with
 * g_free() by the clean callback.
 */
static TransactionActionDrv bdrv_attach_child_common_drv = {
    .abort = bdrv_attach_child_common_abort,
    .clean = g_free,
};
   2960 
   2961 /*
   2962  * Common part of attaching bdrv child to bs or to blk or to job
   2963  *
   2964  * Function doesn't update permissions, caller is responsible for this.
   2965  *
   2966  * Returns new created child.
   2967  */
   2968 static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
   2969                                            const char *child_name,
   2970                                            const BdrvChildClass *child_class,
   2971                                            BdrvChildRole child_role,
   2972                                            uint64_t perm, uint64_t shared_perm,
   2973                                            void *opaque,
   2974                                            Transaction *tran, Error **errp)
   2975 {
   2976     BdrvChild *new_child;
   2977     AioContext *parent_ctx;
   2978     AioContext *child_ctx = bdrv_get_aio_context(child_bs);
   2979 
   2980     assert(child_class->get_parent_desc);
   2981     GLOBAL_STATE_CODE();
   2982 
   2983     new_child = g_new(BdrvChild, 1);
   2984     *new_child = (BdrvChild) {
   2985         .bs             = NULL,
   2986         .name           = g_strdup(child_name),
   2987         .klass          = child_class,
   2988         .role           = child_role,
   2989         .perm           = perm,
   2990         .shared_perm    = shared_perm,
   2991         .opaque         = opaque,
   2992     };
   2993 
   2994     /*
   2995      * If the AioContexts don't match, first try to move the subtree of
   2996      * child_bs into the AioContext of the new parent. If this doesn't work,
   2997      * try moving the parent into the AioContext of child_bs instead.
   2998      */
   2999     parent_ctx = bdrv_child_get_parent_aio_context(new_child);
   3000     if (child_ctx != parent_ctx) {
   3001         Error *local_err = NULL;
   3002         int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL,
   3003                                               &local_err);
   3004 
   3005         if (ret < 0 && child_class->change_aio_ctx) {
   3006             Transaction *tran = tran_new();
   3007             GHashTable *visited = g_hash_table_new(NULL, NULL);
   3008             bool ret_child;
   3009 
   3010             g_hash_table_add(visited, new_child);
   3011             ret_child = child_class->change_aio_ctx(new_child, child_ctx,
   3012                                                     visited, tran, NULL);
   3013             if (ret_child == true) {
   3014                 error_free(local_err);
   3015                 ret = 0;
   3016             }
   3017             tran_finalize(tran, ret_child == true ? 0 : -1);
   3018             g_hash_table_destroy(visited);
   3019         }
   3020 
   3021         if (ret < 0) {
   3022             error_propagate(errp, local_err);
   3023             bdrv_child_free(new_child);
   3024             return NULL;
   3025         }
   3026     }
   3027 
   3028     bdrv_ref(child_bs);
   3029     bdrv_replace_child_noperm(new_child, child_bs);
   3030 
   3031     BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
   3032     *s = (BdrvAttachChildCommonState) {
   3033         .child = new_child,
   3034         .old_parent_ctx = parent_ctx,
   3035         .old_child_ctx = child_ctx,
   3036     };
   3037     tran_add(tran, &bdrv_attach_child_common_drv, s);
   3038 
   3039     return new_child;
   3040 }
   3041 
   3042 /*
   3043  * Function doesn't update permissions, caller is responsible for this.
   3044  */
   3045 static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
   3046                                            BlockDriverState *child_bs,
   3047                                            const char *child_name,
   3048                                            const BdrvChildClass *child_class,
   3049                                            BdrvChildRole child_role,
   3050                                            Transaction *tran,
   3051                                            Error **errp)
   3052 {
   3053     uint64_t perm, shared_perm;
   3054 
   3055     assert(parent_bs->drv);
   3056     GLOBAL_STATE_CODE();
   3057 
   3058     if (bdrv_recurse_has_child(child_bs, parent_bs)) {
   3059         error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle",
   3060                    child_bs->node_name, child_name, parent_bs->node_name);
   3061         return NULL;
   3062     }
   3063 
   3064     bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
   3065     bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
   3066                     perm, shared_perm, &perm, &shared_perm);
   3067 
   3068     return bdrv_attach_child_common(child_bs, child_name, child_class,
   3069                                     child_role, perm, shared_perm, parent_bs,
   3070                                     tran, errp);
   3071 }
   3072 
   3073 static void bdrv_detach_child(BdrvChild *child)
   3074 {
   3075     BlockDriverState *old_bs = child->bs;
   3076 
   3077     GLOBAL_STATE_CODE();
   3078     bdrv_replace_child_noperm(child, NULL);
   3079     bdrv_child_free(child);
   3080 
   3081     if (old_bs) {
   3082         /*
   3083          * Update permissions for old node. We're just taking a parent away, so
   3084          * we're loosening restrictions. Errors of permission update are not
   3085          * fatal in this case, ignore them.
   3086          */
   3087         bdrv_refresh_perms(old_bs, NULL);
   3088 
   3089         /*
   3090          * When the parent requiring a non-default AioContext is removed, the
   3091          * node moves back to the main AioContext
   3092          */
   3093         bdrv_try_change_aio_context(old_bs, qemu_get_aio_context(), NULL, NULL);
   3094     }
   3095 }
   3096 
   3097 /*
   3098  * This function steals the reference to child_bs from the caller.
   3099  * That reference is later dropped by bdrv_root_unref_child().
   3100  *
   3101  * On failure NULL is returned, errp is set and the reference to
   3102  * child_bs is also dropped.
   3103  *
   3104  * The caller must hold the AioContext lock @child_bs, but not that of @ctx
   3105  * (unless @child_bs is already in @ctx).
   3106  */
   3107 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
   3108                                   const char *child_name,
   3109                                   const BdrvChildClass *child_class,
   3110                                   BdrvChildRole child_role,
   3111                                   uint64_t perm, uint64_t shared_perm,
   3112                                   void *opaque, Error **errp)
   3113 {
   3114     int ret;
   3115     BdrvChild *child;
   3116     Transaction *tran = tran_new();
   3117 
   3118     GLOBAL_STATE_CODE();
   3119 
   3120     child = bdrv_attach_child_common(child_bs, child_name, child_class,
   3121                                    child_role, perm, shared_perm, opaque,
   3122                                    tran, errp);
   3123     if (!child) {
   3124         ret = -EINVAL;
   3125         goto out;
   3126     }
   3127 
   3128     ret = bdrv_refresh_perms(child_bs, errp);
   3129 
   3130 out:
   3131     tran_finalize(tran, ret);
   3132 
   3133     bdrv_unref(child_bs);
   3134 
   3135     return ret < 0 ? NULL : child;
   3136 }
   3137 
   3138 /*
   3139  * This function transfers the reference to child_bs from the caller
   3140  * to parent_bs. That reference is later dropped by parent_bs on
   3141  * bdrv_close() or if someone calls bdrv_unref_child().
   3142  *
   3143  * On failure NULL is returned, errp is set and the reference to
   3144  * child_bs is also dropped.
   3145  *
   3146  * If @parent_bs and @child_bs are in different AioContexts, the caller must
   3147  * hold the AioContext lock for @child_bs, but not for @parent_bs.
   3148  */
   3149 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
   3150                              BlockDriverState *child_bs,
   3151                              const char *child_name,
   3152                              const BdrvChildClass *child_class,
   3153                              BdrvChildRole child_role,
   3154                              Error **errp)
   3155 {
   3156     int ret;
   3157     BdrvChild *child;
   3158     Transaction *tran = tran_new();
   3159 
   3160     GLOBAL_STATE_CODE();
   3161 
   3162     child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
   3163                                      child_class, child_role, tran, errp);
   3164     if (!child) {
   3165         ret = -EINVAL;
   3166         goto out;
   3167     }
   3168 
   3169     ret = bdrv_refresh_perms(parent_bs, errp);
   3170     if (ret < 0) {
   3171         goto out;
   3172     }
   3173 
   3174 out:
   3175     tran_finalize(tran, ret);
   3176 
   3177     bdrv_unref(child_bs);
   3178 
   3179     return ret < 0 ? NULL : child;
   3180 }
   3181 
   3182 /* Callers must ensure that child->frozen is false. */
   3183 void bdrv_root_unref_child(BdrvChild *child)
   3184 {
   3185     BlockDriverState *child_bs;
   3186 
   3187     GLOBAL_STATE_CODE();
   3188 
   3189     child_bs = child->bs;
   3190     bdrv_detach_child(child);
   3191     bdrv_unref(child_bs);
   3192 }
   3193 
/*
 * Transaction state recorded by bdrv_set_inherits_from() so that the
 * change can be rolled back.
 */
typedef struct BdrvSetInheritsFrom {
    /* Node whose inherits_from pointer was changed */
    BlockDriverState *bs;
    /* Value of bs->inherits_from before the change */
    BlockDriverState *old_inherits_from;
} BdrvSetInheritsFrom;
   3198 
   3199 static void bdrv_set_inherits_from_abort(void *opaque)
   3200 {
   3201     BdrvSetInheritsFrom *s = opaque;
   3202 
   3203     s->bs->inherits_from = s->old_inherits_from;
   3204 }
   3205 
/*
 * Transaction action registered by bdrv_set_inherits_from(): the old
 * pointer is restored on abort, and the state object is released with
 * g_free() by the clean callback.
 */
static TransactionActionDrv bdrv_set_inherits_from_drv = {
    .abort = bdrv_set_inherits_from_abort,
    .clean = g_free,
};
   3210 
   3211 /* @tran is allowed to be NULL. In this case no rollback is possible */
   3212 static void bdrv_set_inherits_from(BlockDriverState *bs,
   3213                                    BlockDriverState *new_inherits_from,
   3214                                    Transaction *tran)
   3215 {
   3216     if (tran) {
   3217         BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
   3218 
   3219         *s = (BdrvSetInheritsFrom) {
   3220             .bs = bs,
   3221             .old_inherits_from = bs->inherits_from,
   3222         };
   3223 
   3224         tran_add(tran, &bdrv_set_inherits_from_drv, s);
   3225     }
   3226 
   3227     bs->inherits_from = new_inherits_from;
   3228 }
   3229 
   3230 /**
   3231  * Clear all inherits_from pointers from children and grandchildren of
   3232  * @root that point to @root, where necessary.
   3233  * @tran is allowed to be NULL. In this case no rollback is possible
   3234  */
   3235 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
   3236                                      Transaction *tran)
   3237 {
   3238     BdrvChild *c;
   3239 
   3240     if (child->bs->inherits_from == root) {
   3241         /*
   3242          * Remove inherits_from only when the last reference between root and
   3243          * child->bs goes away.
   3244          */
   3245         QLIST_FOREACH(c, &root->children, next) {
   3246             if (c != child && c->bs == child->bs) {
   3247                 break;
   3248             }
   3249         }
   3250         if (c == NULL) {
   3251             bdrv_set_inherits_from(child->bs, NULL, tran);
   3252         }
   3253     }
   3254 
   3255     QLIST_FOREACH(c, &child->bs->children, next) {
   3256         bdrv_unset_inherits_from(root, c, tran);
   3257     }
   3258 }
   3259 
   3260 /* Callers must ensure that child->frozen is false. */
   3261 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
   3262 {
   3263     GLOBAL_STATE_CODE();
   3264     if (child == NULL) {
   3265         return;
   3266     }
   3267 
   3268     bdrv_unset_inherits_from(parent, child, NULL);
   3269     bdrv_root_unref_child(child);
   3270 }
   3271 
   3272 
   3273 static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
   3274 {
   3275     BdrvChild *c;
   3276     GLOBAL_STATE_CODE();
   3277     QLIST_FOREACH(c, &bs->parents, next_parent) {
   3278         if (c->klass->change_media) {
   3279             c->klass->change_media(c, load);
   3280         }
   3281     }
   3282 }
   3283 
   3284 /* Return true if you can reach parent going through child->inherits_from
   3285  * recursively. If parent or child are NULL, return false */
   3286 static bool bdrv_inherits_from_recursive(BlockDriverState *child,
   3287                                          BlockDriverState *parent)
   3288 {
   3289     while (child && child != parent) {
   3290         child = child->inherits_from;
   3291     }
   3292 
   3293     return child != NULL;
   3294 }
   3295 
   3296 /*
   3297  * Return the BdrvChildRole for @bs's backing child.  bs->backing is
   3298  * mostly used for COW backing children (role = COW), but also for
   3299  * filtered children (role = FILTERED | PRIMARY).
   3300  */
   3301 static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
   3302 {
   3303     if (bs->drv && bs->drv->is_filter) {
   3304         return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
   3305     } else {
   3306         return BDRV_CHILD_COW;
   3307     }
   3308 }
   3309 
/*
 * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
 * callers which don't need their own reference any more must call bdrv_unref().
 *
 * @is_backing selects the backing link (true) or the file link (false).
 * An existing child on that link is removed first; if @child_bs is NULL,
 * nothing is attached in its place.  All changes are recorded in @tran.
 *
 * Returns 0 on success, -EPERM if the existing link is frozen, and -EINVAL
 * if @parent_bs has no driver, the driver does not support backing files,
 * a file child is requested for a non-filter node that has none yet, or
 * attaching @child_bs fails.  @errp is set on failure.
 *
 * Function doesn't update permissions, caller is responsible for this.
 */
static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
                                           BlockDriverState *child_bs,
                                           bool is_backing,
                                           Transaction *tran, Error **errp)
{
    /*
     * NOTE(review): presumably this has to be evaluated up front, before
     * bdrv_unset_inherits_from() below may break the inherits_from chain
     * between child_bs and parent_bs -- confirm.
     */
    bool update_inherits_from =
        bdrv_inherits_from_recursive(child_bs, parent_bs);
    BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
    BdrvChildRole role;

    GLOBAL_STATE_CODE();

    if (!parent_bs->drv) {
        /*
         * Node without drv is an object without a class :/. TODO: finally fix
         * qcow2 driver to never clear bs->drv and implement format corruption
         * handling in other way.
         */
        error_setg(errp, "Node corrupted");
        return -EINVAL;
    }

    if (child && child->frozen) {
        error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
                   child->name, parent_bs->node_name, child->bs->node_name);
        return -EPERM;
    }

    if (is_backing && !parent_bs->drv->is_filter &&
        !parent_bs->drv->supports_backing)
    {
        error_setg(errp, "Driver '%s' of node '%s' does not support backing "
                   "files", parent_bs->drv->format_name, parent_bs->node_name);
        return -EINVAL;
    }

    /* Pick the role of the new child */
    if (parent_bs->drv->is_filter) {
        role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
    } else if (is_backing) {
        role = BDRV_CHILD_COW;
    } else {
        /*
         * We only can use same role as it is in existing child. We don't have
         * infrastructure to determine role of file child in generic way
         */
        if (!child) {
            error_setg(errp, "Cannot set file child to format node without "
                       "file child");
            return -EINVAL;
        }
        role = child->role;
    }

    /* Drop the old link, if there is one */
    if (child) {
        bdrv_unset_inherits_from(parent_bs, child, tran);
        bdrv_remove_child(child, tran);
    }

    /* Only removal was requested */
    if (!child_bs) {
        goto out;
    }

    child = bdrv_attach_child_noperm(parent_bs, child_bs,
                                     is_backing ? "backing" : "file",
                                     &child_of_bds, role,
                                     tran, errp);
    if (!child) {
        return -EINVAL;
    }


    /*
     * If inherits_from pointed recursively to bs then let's update it to
     * point directly to bs (else it will become NULL).
     */
    if (update_inherits_from) {
        bdrv_set_inherits_from(child_bs, parent_bs, tran);
    }

out:
    /* Errors from refreshing the limits are deliberately not reported */
    bdrv_refresh_limits(parent_bs, tran, NULL);

    return 0;
}
   3400 
   3401 static int bdrv_set_backing_noperm(BlockDriverState *bs,
   3402                                    BlockDriverState *backing_hd,
   3403                                    Transaction *tran, Error **errp)
   3404 {
   3405     GLOBAL_STATE_CODE();
   3406     return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
   3407 }
   3408 
   3409 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
   3410                         Error **errp)
   3411 {
   3412     int ret;
   3413     Transaction *tran = tran_new();
   3414 
   3415     GLOBAL_STATE_CODE();
   3416     bdrv_drained_begin(bs);
   3417 
   3418     ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
   3419     if (ret < 0) {
   3420         goto out;
   3421     }
   3422 
   3423     ret = bdrv_refresh_perms(bs, errp);
   3424 out:
   3425     tran_finalize(tran, ret);
   3426 
   3427     bdrv_drained_end(bs);
   3428 
   3429     return ret;
   3430 }
   3431 
   3432 /*
   3433  * Opens the backing file for a BlockDriverState if not yet open
   3434  *
   3435  * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
   3436  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
   3437  * itself, all options starting with "${bdref_key}." are considered part of the
   3438  * BlockdevRef.
   3439  *
   3440  * TODO Can this be unified with bdrv_open_image()?
   3441  */
   3442 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
   3443                            const char *bdref_key, Error **errp)
   3444 {
   3445     char *backing_filename = NULL;
   3446     char *bdref_key_dot;
   3447     const char *reference = NULL;
   3448     int ret = 0;
   3449     bool implicit_backing = false;
   3450     BlockDriverState *backing_hd;
   3451     QDict *options;
   3452     QDict *tmp_parent_options = NULL;
   3453     Error *local_err = NULL;
   3454 
   3455     GLOBAL_STATE_CODE();
   3456 
   3457     if (bs->backing != NULL) {
   3458         goto free_exit;
   3459     }
   3460 
   3461     /* NULL means an empty set of options */
   3462     if (parent_options == NULL) {
   3463         tmp_parent_options = qdict_new();
   3464         parent_options = tmp_parent_options;
   3465     }
   3466 
   3467     bs->open_flags &= ~BDRV_O_NO_BACKING;
   3468 
   3469     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
   3470     qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
   3471     g_free(bdref_key_dot);
   3472 
   3473     /*
   3474      * Caution: while qdict_get_try_str() is fine, getting non-string
   3475      * types would require more care.  When @parent_options come from
   3476      * -blockdev or blockdev_add, its members are typed according to
   3477      * the QAPI schema, but when they come from -drive, they're all
   3478      * QString.
   3479      */
   3480     reference = qdict_get_try_str(parent_options, bdref_key);
   3481     if (reference || qdict_haskey(options, "file.filename")) {
   3482         /* keep backing_filename NULL */
   3483     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
   3484         qobject_unref(options);
   3485         goto free_exit;
   3486     } else {
   3487         if (qdict_size(options) == 0) {
   3488             /* If the user specifies options that do not modify the
   3489              * backing file's behavior, we might still consider it the
   3490              * implicit backing file.  But it's easier this way, and
   3491              * just specifying some of the backing BDS's options is
   3492              * only possible with -drive anyway (otherwise the QAPI
   3493              * schema forces the user to specify everything). */
   3494             implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
   3495         }
   3496 
   3497         backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
   3498         if (local_err) {
   3499             ret = -EINVAL;
   3500             error_propagate(errp, local_err);
   3501             qobject_unref(options);
   3502             goto free_exit;
   3503         }
   3504     }
   3505 
   3506     if (!bs->drv || !bs->drv->supports_backing) {
   3507         ret = -EINVAL;
   3508         error_setg(errp, "Driver doesn't support backing files");
   3509         qobject_unref(options);
   3510         goto free_exit;
   3511     }
   3512 
   3513     if (!reference &&
   3514         bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
   3515         qdict_put_str(options, "driver", bs->backing_format);
   3516     }
   3517 
   3518     backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
   3519                                    &child_of_bds, bdrv_backing_role(bs), errp);
   3520     if (!backing_hd) {
   3521         bs->open_flags |= BDRV_O_NO_BACKING;
   3522         error_prepend(errp, "Could not open backing file: ");
   3523         ret = -EINVAL;
   3524         goto free_exit;
   3525     }
   3526 
   3527     if (implicit_backing) {
   3528         bdrv_refresh_filename(backing_hd);
   3529         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   3530                 backing_hd->filename);
   3531     }
   3532 
   3533     /* Hook up the backing file link; drop our reference, bs owns the
   3534      * backing_hd reference now */
   3535     ret = bdrv_set_backing_hd(bs, backing_hd, errp);
   3536     bdrv_unref(backing_hd);
   3537     if (ret < 0) {
   3538         goto free_exit;
   3539     }
   3540 
   3541     qdict_del(parent_options, bdref_key);
   3542 
   3543 free_exit:
   3544     g_free(backing_filename);
   3545     qobject_unref(tmp_parent_options);
   3546     return ret;
   3547 }
   3548 
   3549 static BlockDriverState *
   3550 bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
   3551                    BlockDriverState *parent, const BdrvChildClass *child_class,
   3552                    BdrvChildRole child_role, bool allow_none, Error **errp)
   3553 {
   3554     BlockDriverState *bs = NULL;
   3555     QDict *image_options;
   3556     char *bdref_key_dot;
   3557     const char *reference;
   3558 
   3559     assert(child_class != NULL);
   3560 
   3561     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
   3562     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
   3563     g_free(bdref_key_dot);
   3564 
   3565     /*
   3566      * Caution: while qdict_get_try_str() is fine, getting non-string
   3567      * types would require more care.  When @options come from
   3568      * -blockdev or blockdev_add, its members are typed according to
   3569      * the QAPI schema, but when they come from -drive, they're all
   3570      * QString.
   3571      */
   3572     reference = qdict_get_try_str(options, bdref_key);
   3573     if (!filename && !reference && !qdict_size(image_options)) {
   3574         if (!allow_none) {
   3575             error_setg(errp, "A block device must be specified for \"%s\"",
   3576                        bdref_key);
   3577         }
   3578         qobject_unref(image_options);
   3579         goto done;
   3580     }
   3581 
   3582     bs = bdrv_open_inherit(filename, reference, image_options, 0,
   3583                            parent, child_class, child_role, errp);
   3584     if (!bs) {
   3585         goto done;
   3586     }
   3587 
   3588 done:
   3589     qdict_del(options, bdref_key);
   3590     return bs;
   3591 }
   3592 
   3593 /*
   3594  * Opens a disk image whose options are given as BlockdevRef in another block
   3595  * device's options.
   3596  *
   3597  * If allow_none is true, no image will be opened if filename is false and no
   3598  * BlockdevRef is given. NULL will be returned, but errp remains unset.
   3599  *
   3600  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
   3601  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
   3602  * itself, all options starting with "${bdref_key}." are considered part of the
   3603  * BlockdevRef.
   3604  *
   3605  * The BlockdevRef will be removed from the options QDict.
   3606  */
   3607 BdrvChild *bdrv_open_child(const char *filename,
   3608                            QDict *options, const char *bdref_key,
   3609                            BlockDriverState *parent,
   3610                            const BdrvChildClass *child_class,
   3611                            BdrvChildRole child_role,
   3612                            bool allow_none, Error **errp)
   3613 {
   3614     BlockDriverState *bs;
   3615 
   3616     GLOBAL_STATE_CODE();
   3617 
   3618     bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
   3619                             child_role, allow_none, errp);
   3620     if (bs == NULL) {
   3621         return NULL;
   3622     }
   3623 
   3624     return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
   3625                              errp);
   3626 }
   3627 
   3628 /*
   3629  * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
   3630  */
   3631 int bdrv_open_file_child(const char *filename,
   3632                          QDict *options, const char *bdref_key,
   3633                          BlockDriverState *parent, Error **errp)
   3634 {
   3635     BdrvChildRole role;
   3636 
   3637     /* commit_top and mirror_top don't use this function */
   3638     assert(!parent->drv->filtered_child_is_backing);
   3639     role = parent->drv->is_filter ?
   3640         (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
   3641 
   3642     if (!bdrv_open_child(filename, options, bdref_key, parent,
   3643                          &child_of_bds, role, false, errp))
   3644     {
   3645         return -EINVAL;
   3646     }
   3647 
   3648     return 0;
   3649 }
   3650 
   3651 /*
   3652  * TODO Future callers may need to specify parent/child_class in order for
   3653  * option inheritance to work. Existing callers use it for the root node.
   3654  */
   3655 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
   3656 {
   3657     BlockDriverState *bs = NULL;
   3658     QObject *obj = NULL;
   3659     QDict *qdict = NULL;
   3660     const char *reference = NULL;
   3661     Visitor *v = NULL;
   3662 
   3663     GLOBAL_STATE_CODE();
   3664 
   3665     if (ref->type == QTYPE_QSTRING) {
   3666         reference = ref->u.reference;
   3667     } else {
   3668         BlockdevOptions *options = &ref->u.definition;
   3669         assert(ref->type == QTYPE_QDICT);
   3670 
   3671         v = qobject_output_visitor_new(&obj);
   3672         visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
   3673         visit_complete(v, &obj);
   3674 
   3675         qdict = qobject_to(QDict, obj);
   3676         qdict_flatten(qdict);
   3677 
   3678         /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
   3679          * compatibility with other callers) rather than what we want as the
   3680          * real defaults. Apply the defaults here instead. */
   3681         qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
   3682         qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
   3683         qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
   3684         qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
   3685 
   3686     }
   3687 
   3688     bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
   3689     obj = NULL;
   3690     qobject_unref(obj);
   3691     visit_free(v);
   3692     return bs;
   3693 }
   3694 
   3695 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
   3696                                                    int flags,
   3697                                                    QDict *snapshot_options,
   3698                                                    Error **errp)
   3699 {
   3700     g_autofree char *tmp_filename = NULL;
   3701     int64_t total_size;
   3702     QemuOpts *opts = NULL;
   3703     BlockDriverState *bs_snapshot = NULL;
   3704     int ret;
   3705 
   3706     GLOBAL_STATE_CODE();
   3707 
   3708     /* if snapshot, we create a temporary backing file and open it
   3709        instead of opening 'filename' directly */
   3710 
   3711     /* Get the required size from the image */
   3712     total_size = bdrv_getlength(bs);
   3713     if (total_size < 0) {
   3714         error_setg_errno(errp, -total_size, "Could not get image size");
   3715         goto out;
   3716     }
   3717 
   3718     /* Create the temporary image */
   3719     tmp_filename = create_tmp_file(errp);
   3720     if (!tmp_filename) {
   3721         goto out;
   3722     }
   3723 
   3724     opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
   3725                             &error_abort);
   3726     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
   3727     ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
   3728     qemu_opts_del(opts);
   3729     if (ret < 0) {
   3730         error_prepend(errp, "Could not create temporary overlay '%s': ",
   3731                       tmp_filename);
   3732         goto out;
   3733     }
   3734 
   3735     /* Prepare options QDict for the temporary file */
   3736     qdict_put_str(snapshot_options, "file.driver", "file");
   3737     qdict_put_str(snapshot_options, "file.filename", tmp_filename);
   3738     qdict_put_str(snapshot_options, "driver", "qcow2");
   3739 
   3740     bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
   3741     snapshot_options = NULL;
   3742     if (!bs_snapshot) {
   3743         goto out;
   3744     }
   3745 
   3746     ret = bdrv_append(bs_snapshot, bs, errp);
   3747     if (ret < 0) {
   3748         bs_snapshot = NULL;
   3749         goto out;
   3750     }
   3751 
   3752 out:
   3753     qobject_unref(snapshot_options);
   3754     return bs_snapshot;
   3755 }
   3756 
   3757 /*
   3758  * Opens a disk image (raw, qcow2, vmdk, ...)
   3759  *
   3760  * options is a QDict of options to pass to the block drivers, or NULL for an
   3761  * empty set of options. The reference to the QDict belongs to the block layer
   3762  * after the call (even on failure), so if the caller intends to reuse the
   3763  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
   3764  *
   3765  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
   3766  * If it is not NULL, the referenced BDS will be reused.
   3767  *
   3768  * The reference parameter may be used to specify an existing block device which
   3769  * should be opened. If specified, neither options nor a filename may be given,
   3770  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
   3771  */
   3772 static BlockDriverState *bdrv_open_inherit(const char *filename,
   3773                                            const char *reference,
   3774                                            QDict *options, int flags,
   3775                                            BlockDriverState *parent,
   3776                                            const BdrvChildClass *child_class,
   3777                                            BdrvChildRole child_role,
   3778                                            Error **errp)
   3779 {
   3780     int ret;
   3781     BlockBackend *file = NULL;
   3782     BlockDriverState *bs;
   3783     BlockDriver *drv = NULL;
   3784     BdrvChild *child;
   3785     const char *drvname;
   3786     const char *backing;
   3787     Error *local_err = NULL;
   3788     QDict *snapshot_options = NULL;
   3789     int snapshot_flags = 0;
   3790 
   3791     assert(!child_class || !flags);
   3792     assert(!child_class == !parent);
   3793     GLOBAL_STATE_CODE();
   3794 
   3795     if (reference) {
   3796         bool options_non_empty = options ? qdict_size(options) : false;
   3797         qobject_unref(options);
   3798 
   3799         if (filename || options_non_empty) {
   3800             error_setg(errp, "Cannot reference an existing block device with "
   3801                        "additional options or a new filename");
   3802             return NULL;
   3803         }
   3804 
   3805         bs = bdrv_lookup_bs(reference, reference, errp);
   3806         if (!bs) {
   3807             return NULL;
   3808         }
   3809 
   3810         bdrv_ref(bs);
   3811         return bs;
   3812     }
   3813 
   3814     bs = bdrv_new();
   3815 
   3816     /* NULL means an empty set of options */
   3817     if (options == NULL) {
   3818         options = qdict_new();
   3819     }
   3820 
   3821     /* json: syntax counts as explicit options, as if in the QDict */
   3822     parse_json_protocol(options, &filename, &local_err);
   3823     if (local_err) {
   3824         goto fail;
   3825     }
   3826 
   3827     bs->explicit_options = qdict_clone_shallow(options);
   3828 
   3829     if (child_class) {
   3830         bool parent_is_format;
   3831 
   3832         if (parent->drv) {
   3833             parent_is_format = parent->drv->is_format;
   3834         } else {
   3835             /*
   3836              * parent->drv is not set yet because this node is opened for
   3837              * (potential) format probing.  That means that @parent is going
   3838              * to be a format node.
   3839              */
   3840             parent_is_format = true;
   3841         }
   3842 
   3843         bs->inherits_from = parent;
   3844         child_class->inherit_options(child_role, parent_is_format,
   3845                                      &flags, options,
   3846                                      parent->open_flags, parent->options);
   3847     }
   3848 
   3849     ret = bdrv_fill_options(&options, filename, &flags, &local_err);
   3850     if (ret < 0) {
   3851         goto fail;
   3852     }
   3853 
   3854     /*
   3855      * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
   3856      * Caution: getting a boolean member of @options requires care.
   3857      * When @options come from -blockdev or blockdev_add, members are
   3858      * typed according to the QAPI schema, but when they come from
   3859      * -drive, they're all QString.
   3860      */
   3861     if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
   3862         !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
   3863         flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
   3864     } else {
   3865         flags &= ~BDRV_O_RDWR;
   3866     }
   3867 
   3868     if (flags & BDRV_O_SNAPSHOT) {
   3869         snapshot_options = qdict_new();
   3870         bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
   3871                                    flags, options);
   3872         /* Let bdrv_backing_options() override "read-only" */
   3873         qdict_del(options, BDRV_OPT_READ_ONLY);
   3874         bdrv_inherited_options(BDRV_CHILD_COW, true,
   3875                                &flags, options, flags, options);
   3876     }
   3877 
   3878     bs->open_flags = flags;
   3879     bs->options = options;
   3880     options = qdict_clone_shallow(options);
   3881 
   3882     /* Find the right image format driver */
   3883     /* See cautionary note on accessing @options above */
   3884     drvname = qdict_get_try_str(options, "driver");
   3885     if (drvname) {
   3886         drv = bdrv_find_format(drvname);
   3887         if (!drv) {
   3888             error_setg(errp, "Unknown driver: '%s'", drvname);
   3889             goto fail;
   3890         }
   3891     }
   3892 
   3893     assert(drvname || !(flags & BDRV_O_PROTOCOL));
   3894 
   3895     /* See cautionary note on accessing @options above */
   3896     backing = qdict_get_try_str(options, "backing");
   3897     if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
   3898         (backing && *backing == '\0'))
   3899     {
   3900         if (backing) {
   3901             warn_report("Use of \"backing\": \"\" is deprecated; "
   3902                         "use \"backing\": null instead");
   3903         }
   3904         flags |= BDRV_O_NO_BACKING;
   3905         qdict_del(bs->explicit_options, "backing");
   3906         qdict_del(bs->options, "backing");
   3907         qdict_del(options, "backing");
   3908     }
   3909 
   3910     /* Open image file without format layer. This BlockBackend is only used for
   3911      * probing, the block drivers will do their own bdrv_open_child() for the
   3912      * same BDS, which is why we put the node name back into options. */
   3913     if ((flags & BDRV_O_PROTOCOL) == 0) {
   3914         BlockDriverState *file_bs;
   3915 
   3916         file_bs = bdrv_open_child_bs(filename, options, "file", bs,
   3917                                      &child_of_bds, BDRV_CHILD_IMAGE,
   3918                                      true, &local_err);
   3919         if (local_err) {
   3920             goto fail;
   3921         }
   3922         if (file_bs != NULL) {
   3923             /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
   3924              * looking at the header to guess the image format. This works even
   3925              * in cases where a guest would not see a consistent state. */
   3926             file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
   3927             blk_insert_bs(file, file_bs, &local_err);
   3928             bdrv_unref(file_bs);
   3929             if (local_err) {
   3930                 goto fail;
   3931             }
   3932 
   3933             qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
   3934         }
   3935     }
   3936 
   3937     /* Image format probing */
   3938     bs->probed = !drv;
   3939     if (!drv && file) {
   3940         ret = find_image_format(file, filename, &drv, &local_err);
   3941         if (ret < 0) {
   3942             goto fail;
   3943         }
   3944         /*
   3945          * This option update would logically belong in bdrv_fill_options(),
   3946          * but we first need to open bs->file for the probing to work, while
   3947          * opening bs->file already requires the (mostly) final set of options
   3948          * so that cache mode etc. can be inherited.
   3949          *
   3950          * Adding the driver later is somewhat ugly, but it's not an option
   3951          * that would ever be inherited, so it's correct. We just need to make
   3952          * sure to update both bs->options (which has the full effective
   3953          * options for bs) and options (which has file.* already removed).
   3954          */
   3955         qdict_put_str(bs->options, "driver", drv->format_name);
   3956         qdict_put_str(options, "driver", drv->format_name);
   3957     } else if (!drv) {
   3958         error_setg(errp, "Must specify either driver or file");
   3959         goto fail;
   3960     }
   3961 
   3962     /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
   3963     assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
   3964     /* file must be NULL if a protocol BDS is about to be created
   3965      * (the inverse results in an error message from bdrv_open_common()) */
   3966     assert(!(flags & BDRV_O_PROTOCOL) || !file);
   3967 
   3968     /* Open the image */
   3969     ret = bdrv_open_common(bs, file, options, &local_err);
   3970     if (ret < 0) {
   3971         goto fail;
   3972     }
   3973 
   3974     if (file) {
   3975         blk_unref(file);
   3976         file = NULL;
   3977     }
   3978 
   3979     /* If there is a backing file, use it */
   3980     if ((flags & BDRV_O_NO_BACKING) == 0) {
   3981         ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
   3982         if (ret < 0) {
   3983             goto close_and_fail;
   3984         }
   3985     }
   3986 
   3987     /* Remove all children options and references
   3988      * from bs->options and bs->explicit_options */
   3989     QLIST_FOREACH(child, &bs->children, next) {
   3990         char *child_key_dot;
   3991         child_key_dot = g_strdup_printf("%s.", child->name);
   3992         qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
   3993         qdict_extract_subqdict(bs->options, NULL, child_key_dot);
   3994         qdict_del(bs->explicit_options, child->name);
   3995         qdict_del(bs->options, child->name);
   3996         g_free(child_key_dot);
   3997     }
   3998 
   3999     /* Check if any unknown options were used */
   4000     if (qdict_size(options) != 0) {
   4001         const QDictEntry *entry = qdict_first(options);
   4002         if (flags & BDRV_O_PROTOCOL) {
   4003             error_setg(errp, "Block protocol '%s' doesn't support the option "
   4004                        "'%s'", drv->format_name, entry->key);
   4005         } else {
   4006             error_setg(errp,
   4007                        "Block format '%s' does not support the option '%s'",
   4008                        drv->format_name, entry->key);
   4009         }
   4010 
   4011         goto close_and_fail;
   4012     }
   4013 
   4014     bdrv_parent_cb_change_media(bs, true);
   4015 
   4016     qobject_unref(options);
   4017     options = NULL;
   4018 
   4019     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
   4020      * temporary snapshot afterwards. */
   4021     if (snapshot_flags) {
   4022         BlockDriverState *snapshot_bs;
   4023         snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
   4024                                                 snapshot_options, &local_err);
   4025         snapshot_options = NULL;
   4026         if (local_err) {
   4027             goto close_and_fail;
   4028         }
   4029         /* We are not going to return bs but the overlay on top of it
   4030          * (snapshot_bs); thus, we have to drop the strong reference to bs
   4031          * (which we obtained by calling bdrv_new()). bs will not be deleted,
   4032          * though, because the overlay still has a reference to it. */
   4033         bdrv_unref(bs);
   4034         bs = snapshot_bs;
   4035     }
   4036 
   4037     return bs;
   4038 
   4039 fail:
   4040     blk_unref(file);
   4041     qobject_unref(snapshot_options);
   4042     qobject_unref(bs->explicit_options);
   4043     qobject_unref(bs->options);
   4044     qobject_unref(options);
   4045     bs->options = NULL;
   4046     bs->explicit_options = NULL;
   4047     bdrv_unref(bs);
   4048     error_propagate(errp, local_err);
   4049     return NULL;
   4050 
   4051 close_and_fail:
   4052     bdrv_unref(bs);
   4053     qobject_unref(snapshot_options);
   4054     qobject_unref(options);
   4055     error_propagate(errp, local_err);
   4056     return NULL;
   4057 }
   4058 
   4059 BlockDriverState *bdrv_open(const char *filename, const char *reference,
   4060                             QDict *options, int flags, Error **errp)
   4061 {
   4062     GLOBAL_STATE_CODE();
   4063 
   4064     return bdrv_open_inherit(filename, reference, options, flags, NULL,
   4065                              NULL, 0, errp);
   4066 }
   4067 
   4068 /* Return true if the NULL-terminated @list contains @str */
   4069 static bool is_str_in_list(const char *str, const char *const *list)
   4070 {
   4071     if (str && list) {
   4072         int i;
   4073         for (i = 0; list[i] != NULL; i++) {
   4074             if (!strcmp(str, list[i])) {
   4075                 return true;
   4076             }
   4077         }
   4078     }
   4079     return false;
   4080 }
   4081 
   4082 /*
   4083  * Check that every option set in @bs->options is also set in
   4084  * @new_opts.
   4085  *
   4086  * Options listed in the common_options list and in
   4087  * @bs->drv->mutable_opts are skipped.
   4088  *
   4089  * Return 0 on success, otherwise return -EINVAL and set @errp.
   4090  */
   4091 static int bdrv_reset_options_allowed(BlockDriverState *bs,
   4092                                       const QDict *new_opts, Error **errp)
   4093 {
   4094     const QDictEntry *e;
   4095     /* These options are common to all block drivers and are handled
   4096      * in bdrv_reopen_prepare() so they can be left out of @new_opts */
   4097     const char *const common_options[] = {
   4098         "node-name", "discard", "cache.direct", "cache.no-flush",
   4099         "read-only", "auto-read-only", "detect-zeroes", NULL
   4100     };
   4101 
   4102     for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
   4103         if (!qdict_haskey(new_opts, e->key) &&
   4104             !is_str_in_list(e->key, common_options) &&
   4105             !is_str_in_list(e->key, bs->drv->mutable_opts)) {
   4106             error_setg(errp, "Option '%s' cannot be reset "
   4107                        "to its default value", e->key);
   4108             return -EINVAL;
   4109         }
   4110     }
   4111 
   4112     return 0;
   4113 }
   4114 
   4115 /*
   4116  * Returns true if @child can be reached recursively from @bs
   4117  */
   4118 static bool bdrv_recurse_has_child(BlockDriverState *bs,
   4119                                    BlockDriverState *child)
   4120 {
   4121     BdrvChild *c;
   4122 
   4123     if (bs == child) {
   4124         return true;
   4125     }
   4126 
   4127     QLIST_FOREACH(c, &bs->children, next) {
   4128         if (bdrv_recurse_has_child(c->bs, child)) {
   4129             return true;
   4130         }
   4131     }
   4132 
   4133     return false;
   4134 }
   4135 
   4136 /*
   4137  * Adds a BlockDriverState to a simple queue for an atomic, transactional
   4138  * reopen of multiple devices.
   4139  *
   4140  * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
   4141  * already performed, or alternatively may be NULL, in which case a new
   4142  * BlockReopenQueue will be created and initialized. This newly created
   4143  * BlockReopenQueue should be passed back in for subsequent calls that are
   4144  * intended to be of the same atomic 'set'.
   4145  *
   4146  * bs is the BlockDriverState to add to the reopen queue.
   4147  *
   4148  * options contains the changed options for the associated bs
   4149  * (the BlockReopenQueue takes ownership)
   4150  *
   4151  * flags contains the open flags for the associated bs
   4152  *
   4153  * returns a pointer to bs_queue, which is either the newly allocated
   4154  * bs_queue, or the existing bs_queue being used.
   4155  *
   4156  * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
   4157  */
   4158 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
   4159                                                  BlockDriverState *bs,
   4160                                                  QDict *options,
   4161                                                  const BdrvChildClass *klass,
   4162                                                  BdrvChildRole role,
   4163                                                  bool parent_is_format,
   4164                                                  QDict *parent_options,
   4165                                                  int parent_flags,
   4166                                                  bool keep_old_opts)
   4167 {
   4168     assert(bs != NULL);
   4169 
   4170     BlockReopenQueueEntry *bs_entry;
   4171     BdrvChild *child;
   4172     QDict *old_options, *explicit_options, *options_copy;
   4173     int flags;
   4174     QemuOpts *opts;
   4175 
   4176     /* Make sure that the caller remembered to use a drained section. This is
   4177      * important to avoid graph changes between the recursive queuing here and
   4178      * bdrv_reopen_multiple(). */
   4179     assert(bs->quiesce_counter > 0);
   4180     GLOBAL_STATE_CODE();
   4181 
   4182     if (bs_queue == NULL) {
   4183         bs_queue = g_new0(BlockReopenQueue, 1);
   4184         QTAILQ_INIT(bs_queue);
   4185     }
   4186 
   4187     if (!options) {
   4188         options = qdict_new();
   4189     }
   4190 
   4191     /* Check if this BlockDriverState is already in the queue */
   4192     QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
   4193         if (bs == bs_entry->state.bs) {
   4194             break;
   4195         }
   4196     }
   4197 
   4198     /*
   4199      * Precedence of options:
   4200      * 1. Explicitly passed in options (highest)
   4201      * 2. Retained from explicitly set options of bs
   4202      * 3. Inherited from parent node
   4203      * 4. Retained from effective options of bs
   4204      */
   4205 
   4206     /* Old explicitly set values (don't overwrite by inherited value) */
   4207     if (bs_entry || keep_old_opts) {
   4208         old_options = qdict_clone_shallow(bs_entry ?
   4209                                           bs_entry->state.explicit_options :
   4210                                           bs->explicit_options);
   4211         bdrv_join_options(bs, options, old_options);
   4212         qobject_unref(old_options);
   4213     }
   4214 
   4215     explicit_options = qdict_clone_shallow(options);
   4216 
   4217     /* Inherit from parent node */
   4218     if (parent_options) {
   4219         flags = 0;
   4220         klass->inherit_options(role, parent_is_format, &flags, options,
   4221                                parent_flags, parent_options);
   4222     } else {
   4223         flags = bdrv_get_flags(bs);
   4224     }
   4225 
   4226     if (keep_old_opts) {
   4227         /* Old values are used for options that aren't set yet */
   4228         old_options = qdict_clone_shallow(bs->options);
   4229         bdrv_join_options(bs, options, old_options);
   4230         qobject_unref(old_options);
   4231     }
   4232 
   4233     /* We have the final set of options so let's update the flags */
   4234     options_copy = qdict_clone_shallow(options);
   4235     opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
   4236     qemu_opts_absorb_qdict(opts, options_copy, NULL);
   4237     update_flags_from_options(&flags, opts);
   4238     qemu_opts_del(opts);
   4239     qobject_unref(options_copy);
   4240 
   4241     /* bdrv_open_inherit() sets and clears some additional flags internally */
   4242     flags &= ~BDRV_O_PROTOCOL;
   4243     if (flags & BDRV_O_RDWR) {
   4244         flags |= BDRV_O_ALLOW_RDWR;
   4245     }
   4246 
   4247     if (!bs_entry) {
   4248         bs_entry = g_new0(BlockReopenQueueEntry, 1);
   4249         QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
   4250     } else {
   4251         qobject_unref(bs_entry->state.options);
   4252         qobject_unref(bs_entry->state.explicit_options);
   4253     }
   4254 
   4255     bs_entry->state.bs = bs;
   4256     bs_entry->state.options = options;
   4257     bs_entry->state.explicit_options = explicit_options;
   4258     bs_entry->state.flags = flags;
   4259 
   4260     /*
   4261      * If keep_old_opts is false then it means that unspecified
   4262      * options must be reset to their original value. We don't allow
   4263      * resetting 'backing' but we need to know if the option is
   4264      * missing in order to decide if we have to return an error.
   4265      */
   4266     if (!keep_old_opts) {
   4267         bs_entry->state.backing_missing =
   4268             !qdict_haskey(options, "backing") &&
   4269             !qdict_haskey(options, "backing.driver");
   4270     }
   4271 
   4272     QLIST_FOREACH(child, &bs->children, next) {
   4273         QDict *new_child_options = NULL;
   4274         bool child_keep_old = keep_old_opts;
   4275 
   4276         /* reopen can only change the options of block devices that were
   4277          * implicitly created and inherited options. For other (referenced)
   4278          * block devices, a syntax like "backing.foo" results in an error. */
   4279         if (child->bs->inherits_from != bs) {
   4280             continue;
   4281         }
   4282 
   4283         /* Check if the options contain a child reference */
   4284         if (qdict_haskey(options, child->name)) {
   4285             const char *childref = qdict_get_try_str(options, child->name);
   4286             /*
   4287              * The current child must not be reopened if the child
   4288              * reference is null or points to a different node.
   4289              */
   4290             if (g_strcmp0(childref, child->bs->node_name)) {
   4291                 continue;
   4292             }
   4293             /*
   4294              * If the child reference points to the current child then
   4295              * reopen it with its existing set of options (note that
   4296              * it can still inherit new options from the parent).
   4297              */
   4298             child_keep_old = true;
   4299         } else {
   4300             /* Extract child options ("child-name.*") */
   4301             char *child_key_dot = g_strdup_printf("%s.", child->name);
   4302             qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
   4303             qdict_extract_subqdict(options, &new_child_options, child_key_dot);
   4304             g_free(child_key_dot);
   4305         }
   4306 
   4307         bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
   4308                                 child->klass, child->role, bs->drv->is_format,
   4309                                 options, flags, child_keep_old);
   4310     }
   4311 
   4312     return bs_queue;
   4313 }
   4314 /* Queues bs as a root: there is no parent to inherit options/flags from. */
   4315 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
   4316                                     BlockDriverState *bs,
   4317                                     QDict *options, bool keep_old_opts)
   4318 {
   4319     GLOBAL_STATE_CODE();
   4320 
   4321     return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
   4322                                    NULL, 0, keep_old_opts);
   4323 }
   4324 /* Frees bs_queue, its entries and their option QDict refs; NULL is a no-op. */
   4325 void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
   4326 {
   4327     GLOBAL_STATE_CODE();
   4328     if (bs_queue) {
   4329         BlockReopenQueueEntry *bs_entry, *next;
   4330         QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
   4331             qobject_unref(bs_entry->state.explicit_options);
   4332             qobject_unref(bs_entry->state.options);
   4333             g_free(bs_entry);
   4334         }
   4335         g_free(bs_queue);
   4336     }
   4337 }
   4338 
   4339 /*
   4340  * Reopen multiple BlockDriverStates atomically & transactionally.
   4341  *
   4342  * The queue passed in (bs_queue) must have been built up previously
   4343  * via bdrv_reopen_queue().
   4344  *
   4345  * Reopens all BDS specified in the queue, with the appropriate
   4346  * flags.  All devices are prepared for reopen, and failure of any
   4347  * device will cause all device changes to be abandoned, and intermediate
   4348  * data cleaned up.  If all devices prepare successfully, then the changes
   4349  * are committed to all devices.
   4350  *
   4351  * Returns 0 on success, a negative value on failure; always frees bs_queue.
   4352  *
   4353  * All affected nodes must be drained between bdrv_reopen_queue() and
   4354  * bdrv_reopen_multiple().
   4355  *
   4356  * To be called from the main thread, with all other AioContexts unlocked.
   4357  */
   4358 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
   4359 {
   4360     int ret = -1;
   4361     BlockReopenQueueEntry *bs_entry, *next;
   4362     AioContext *ctx;
   4363     Transaction *tran = tran_new();
   4364     g_autoptr(GHashTable) found = NULL;
   4365     g_autoptr(GSList) refresh_list = NULL;
   4366 
   4367     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
   4368     assert(bs_queue != NULL);
   4369     GLOBAL_STATE_CODE();
   4370     /* Flush every queued node before any of them is prepared */
   4371     QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
   4372         ctx = bdrv_get_aio_context(bs_entry->state.bs);
   4373         aio_context_acquire(ctx);
   4374         ret = bdrv_flush(bs_entry->state.bs);
   4375         aio_context_release(ctx);
   4376         if (ret < 0) {
   4377             error_setg_errno(errp, -ret, "Error flushing drive");
   4378             goto abort;
   4379         }
   4380     }
   4381 
   4382     QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
   4383         assert(bs_entry->state.bs->quiesce_counter > 0);
   4384         ctx = bdrv_get_aio_context(bs_entry->state.bs);
   4385         aio_context_acquire(ctx);
   4386         ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
   4387         aio_context_release(ctx);
   4388         if (ret < 0) {
   4389             goto abort;
   4390         }
   4391         bs_entry->prepared = true;
   4392     }
   4393     /* Gather nodes for the permission refresh, incl. replaced children */
   4394     found = g_hash_table_new(NULL, NULL);
   4395     QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
   4396         BDRVReopenState *state = &bs_entry->state;
   4397 
   4398         refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
   4399         if (state->old_backing_bs) {
   4400             refresh_list = bdrv_topological_dfs(refresh_list, found,
   4401                                                 state->old_backing_bs);
   4402         }
   4403         if (state->old_file_bs) {
   4404             refresh_list = bdrv_topological_dfs(refresh_list, found,
   4405                                                 state->old_file_bs);
   4406         }
   4407     }
   4408 
   4409     /*
   4410      * Note that the file-posix driver relies on the permission update done
   4411      * during reopen (even if no permission changed), because it wants "new"
   4412      * permissions for reconfiguring the fd and that's why it does it in
   4413      * raw_check_perm(), not in raw_reopen_prepare() which is called with "old" permissions.
   4414      */
   4415     ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
   4416     if (ret < 0) {
   4417         goto abort;
   4418     }
   4419 
   4420     /*
   4421      * If we reach this point, we have success and just need to apply the
   4422      * changes.
   4423      *
   4424      * Reverse order is used for the benefit of the qcow2 driver: on commit it
   4425      * needs to write the IN_USE flag to the image, to mark bitmaps in the image
   4426      * as invalid. But children usually come after parents in the reopen queue,
   4427      * so go from the last to the first element.
   4428      */
   4429     QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
   4430         ctx = bdrv_get_aio_context(bs_entry->state.bs);
   4431         aio_context_acquire(ctx);
   4432         bdrv_reopen_commit(&bs_entry->state);
   4433         aio_context_release(ctx);
   4434     }
   4435 
   4436     tran_commit(tran);
   4437 
   4438     QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
   4439         BlockDriverState *bs = bs_entry->state.bs;
   4440 
   4441         if (bs->drv->bdrv_reopen_commit_post) {
   4442             ctx = bdrv_get_aio_context(bs);
   4443             aio_context_acquire(ctx);
   4444             bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
   4445             aio_context_release(ctx);
   4446         }
   4447     }
   4448 
   4449     ret = 0;
   4450     goto cleanup;
   4451 
   4452 abort:
   4453     tran_abort(tran);
   4454     QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
   4455         if (bs_entry->prepared) {
   4456             ctx = bdrv_get_aio_context(bs_entry->state.bs);
   4457             aio_context_acquire(ctx);
   4458             bdrv_reopen_abort(&bs_entry->state);
   4459             aio_context_release(ctx);
   4460         }
   4461     }
   4462 
   4463 cleanup:
   4464     bdrv_reopen_queue_free(bs_queue);
   4465 
   4466     return ret;
   4467 }
   4468 /* Convenience wrapper: drains bs's subtree, then queues and reopens bs. */
   4469 int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
   4470                 Error **errp)
   4471 {
   4472     AioContext *ctx = bdrv_get_aio_context(bs);
   4473     BlockReopenQueue *queue;
   4474     int ret;
   4475 
   4476     GLOBAL_STATE_CODE();
   4477 
   4478     bdrv_subtree_drained_begin(bs);
   4479     if (ctx != qemu_get_aio_context()) {
   4480         aio_context_release(ctx);
   4481     }
   4482     /* bdrv_reopen_multiple() must run with non-main AioContexts unlocked */
   4483     queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
   4484     ret = bdrv_reopen_multiple(queue, errp);
   4485 
   4486     if (ctx != qemu_get_aio_context()) {
   4487         aio_context_acquire(ctx);
   4488     }
   4489     bdrv_subtree_drained_end(bs);
   4490 
   4491     return ret;
   4492 }
   4493 /* Reopens bs changing only its read-only option; old options are kept. */
   4494 int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
   4495                               Error **errp)
   4496 {
   4497     QDict *opts = qdict_new();
   4498 
   4499     GLOBAL_STATE_CODE();
   4500 
   4501     qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
   4502 
   4503     return bdrv_reopen(bs, opts, true, errp);
   4504 }
   4505 
   4506 /*
   4507  * Take a BDRVReopenState and check if the value of 'backing' (if @is_backing)
   4508  * or 'file' in the reopen_state->options QDict is valid or not.
   4509  *
   4510  * If the option is missing from the QDict then return 0.
   4511  *
   4512  * If it contains the node name of the current child of
   4513  * reopen_state->bs then return 0.
   4514  *
   4515  * If it contains a different node name (or is null, for 'backing' only) then
   4516  * check whether the current child can be replaced with the new one.
   4517  * If that's the case then the old child is stored in
   4518  * reopen_state->old_backing_bs or reopen_state->old_file_bs and the
   4519  * replacement is done by bdrv_set_file_or_backing_noperm() using @tran.
   4520  *
   4521  * Return 0 on success, otherwise return < 0 and set @errp.
   4522  */
   4523 static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
   4524                                              bool is_backing, Transaction *tran,
   4525                                              Error **errp)
   4526 {
   4527     BlockDriverState *bs = reopen_state->bs;
   4528     BlockDriverState *new_child_bs;
   4529     BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
   4530                                                   child_bs(bs->file);
   4531     const char *child_name = is_backing ? "backing" : "file";
   4532     QObject *value;
   4533     const char *str;
   4534 
   4535     GLOBAL_STATE_CODE();
   4536 
   4537     value = qdict_get(reopen_state->options, child_name);
   4538     if (value == NULL) {
   4539         return 0;
   4540     }
   4541 
   4542     switch (qobject_type(value)) {
   4543     case QTYPE_QNULL:
   4544         assert(is_backing); /* The 'file' option does not allow a null value */
   4545         new_child_bs = NULL;
   4546         break;
   4547     case QTYPE_QSTRING:
   4548         str = qstring_get_str(qobject_to(QString, value));
   4549         new_child_bs = bdrv_lookup_bs(NULL, str, errp);
   4550         if (new_child_bs == NULL) {
   4551             return -EINVAL;
   4552         } else if (bdrv_recurse_has_child(new_child_bs, bs)) {
   4553             error_setg(errp, "Making '%s' a %s child of '%s' would create a "
   4554                        "cycle", str, child_name, bs->node_name);
   4555             return -EINVAL;
   4556         }
   4557         break;
   4558     default:
   4559         /*
   4560          * The options QDict has been flattened, so 'backing' and 'file'
   4561          * do not allow any other data type here.
   4562          */
   4563         g_assert_not_reached();
   4564     }
   4565 
   4566     if (old_child_bs == new_child_bs) {
   4567         return 0;
   4568     }
   4569 
   4570     if (old_child_bs) {
   4571         if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
   4572             return 0;
   4573         }
   4574 
   4575         if (old_child_bs->implicit) {
   4576             error_setg(errp, "Cannot replace implicit %s child of %s",
   4577                        child_name, bs->node_name);
   4578             return -EPERM;
   4579         }
   4580     }
   4581 
   4582     if (bs->drv->is_filter && !old_child_bs) {
   4583         /*
   4584          * Filters always have a file or a backing child, so we are trying to
   4585          * change the wrong child
   4586          */
   4587         error_setg(errp, "'%s' is a %s filter node that does not support a "
   4588                    "%s child", bs->node_name, bs->drv->format_name, child_name);
   4589         return -EINVAL;
   4590     }
   4591 
   4592     if (is_backing) {
   4593         reopen_state->old_backing_bs = old_child_bs;
   4594     } else {
   4595         reopen_state->old_file_bs = old_child_bs;
   4596     }
   4597 
   4598     return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
   4599                                            tran, errp);
   4600 }
   4601 
   4602 /*
   4603  * Prepares a BlockDriverState for reopen. All changes are staged in the
   4604  * 'opaque' field of the BDRVReopenState, which is used and allocated by
   4605  * the block driver layer .bdrv_reopen_prepare()
   4606  *
   4607  * reopen_state holds the BlockDriverState to reopen, its new options and flags
   4608  * queue is the reopen queue
   4609  * change_child_tran is passed on for 'backing'/'file' child replacements
   4610  *
   4611  * Returns 0 on success, non-zero on error.  On error errp will be set
   4612  * as well.
   4613  *
   4614  * On failure, the driver's bdrv_reopen_abort() is called here if needed.
   4615  * It is the responsibility of the caller to then call the abort() or
   4616  * commit() for any other BDS that have been left in a prepare() state
   4617  *
   4618  */
   4619 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
   4620                                BlockReopenQueue *queue,
   4621                                Transaction *change_child_tran, Error **errp)
   4622 {
   4623     int ret = -1;
   4624     int old_flags;
   4625     Error *local_err = NULL;
   4626     BlockDriver *drv;
   4627     QemuOpts *opts;
   4628     QDict *orig_reopen_opts;
   4629     char *discard = NULL;
   4630     bool read_only;
   4631     bool drv_prepared = false;
   4632 
   4633     assert(reopen_state != NULL);
   4634     assert(reopen_state->bs->drv != NULL);
   4635     GLOBAL_STATE_CODE();
   4636     drv = reopen_state->bs->drv;
   4637 
   4638     /* This function and each driver's bdrv_reopen_prepare() remove
   4639      * entries from reopen_state->options as they are processed, so
   4640      * we need to make a copy of the original QDict. */
   4641     orig_reopen_opts = qdict_clone_shallow(reopen_state->options);
   4642 
   4643     /* Process generic block layer options */
   4644     opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
   4645     if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
   4646         ret = -EINVAL;
   4647         goto error;
   4648     }
   4649 
   4650     /* This was already called in bdrv_reopen_queue_child() so the flags
   4651      * are up-to-date. This time we simply want to remove the options from
   4652      * QemuOpts in order to indicate that they have been processed. */
   4653     old_flags = reopen_state->flags;
   4654     update_flags_from_options(&reopen_state->flags, opts);
   4655     assert(old_flags == reopen_state->flags);
   4656 
   4657     discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
   4658     if (discard != NULL) {
   4659         if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
   4660             error_setg(errp, "Invalid discard option");
   4661             ret = -EINVAL;
   4662             goto error;
   4663         }
   4664     }
   4665 
   4666     reopen_state->detect_zeroes =
   4667         bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
   4668     if (local_err) {
   4669         error_propagate(errp, local_err);
   4670         ret = -EINVAL;
   4671         goto error;
   4672     }
   4673 
   4674     /* All other options (including node-name and driver) must be unchanged.
   4675      * Put them back into the QDict, so that they are checked at the end
   4676      * of this function. */
   4677     qemu_opts_to_qdict(opts, reopen_state->options);
   4678 
   4679     /* If we are to stay read-only, do not allow permission change
   4680      * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
   4681      * not set, or if the BDS still has copy_on_read enabled */
   4682     read_only = !(reopen_state->flags & BDRV_O_RDWR);
   4683     ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
   4684     if (local_err) {
   4685         error_propagate(errp, local_err);
   4686         goto error;
   4687     }
   4688 
   4689     if (drv->bdrv_reopen_prepare) {
   4690         /*
   4691          * If a driver-specific option is missing, it means that we
   4692          * should reset it to its default value.
   4693          * But not all options allow that, so we need to check it first.
   4694          */
   4695         ret = bdrv_reset_options_allowed(reopen_state->bs,
   4696                                          reopen_state->options, errp);
   4697         if (ret) {
   4698             goto error;
   4699         }
   4700 
   4701         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
   4702         if (ret) {
   4703             if (local_err != NULL) {
   4704                 error_propagate(errp, local_err);
   4705             } else {
   4706                 bdrv_refresh_filename(reopen_state->bs);
   4707                 error_setg(errp, "failed while preparing to reopen image '%s'",
   4708                            reopen_state->bs->filename);
   4709             }
   4710             goto error;
   4711         }
   4712     } else {
   4713         /* It is currently mandatory to have a bdrv_reopen_prepare()
   4714          * handler for each supported drv. */
   4715         error_setg(errp, "Block format '%s' used by node '%s' "
   4716                    "does not support reopening files", drv->format_name,
   4717                    bdrv_get_device_or_node_name(reopen_state->bs));
   4718         ret = -1;
   4719         goto error;
   4720     }
   4721 
   4722     drv_prepared = true;
   4723 
   4724     /*
   4725      * We must provide the 'backing' option if the BDS has a backing
   4726      * file or if the image file has a backing file name as part of
   4727      * its metadata. Otherwise the 'backing' option can be omitted.
   4728      */
   4729     if (drv->supports_backing && reopen_state->backing_missing &&
   4730         (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
   4731         error_setg(errp, "backing is missing for '%s'",
   4732                    reopen_state->bs->node_name);
   4733         ret = -EINVAL;
   4734         goto error;
   4735     }
   4736 
   4737     /*
   4738      * Allow changing the 'backing' option. The new value can be
   4739      * either a reference to an existing node (using its node name)
   4740      * or NULL to simply detach the current backing file.
   4741      */
   4742     ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
   4743                                             change_child_tran, errp);
   4744     if (ret < 0) {
   4745         goto error;
   4746     }
   4747     qdict_del(reopen_state->options, "backing");
   4748 
   4749     /* Allow changing the 'file' option. In this case NULL is not allowed */
   4750     ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
   4751                                             change_child_tran, errp);
   4752     if (ret < 0) {
   4753         goto error;
   4754     }
   4755     qdict_del(reopen_state->options, "file");
   4756 
   4757     /* Options that are not handled are only okay if they are unchanged
   4758      * compared to the old state. It is expected that some options are only
   4759      * used for the initial open, but not reopen (e.g. filename) */
   4760     if (qdict_size(reopen_state->options)) {
   4761         const QDictEntry *entry = qdict_first(reopen_state->options);
   4762 
   4763         do {
   4764             QObject *new = entry->value;
   4765             QObject *old = qdict_get(reopen_state->bs->options, entry->key);
   4766 
   4767             /* Allow child references (child_name=node_name) as long as they
   4768              * point to the current child (i.e. everything stays the same). */
   4769             if (qobject_type(new) == QTYPE_QSTRING) {
   4770                 BdrvChild *child;
   4771                 QLIST_FOREACH(child, &reopen_state->bs->children, next) {
   4772                     if (!strcmp(child->name, entry->key)) {
   4773                         break;
   4774                     }
   4775                 }
   4776 
   4777                 if (child) {
   4778                     if (!strcmp(child->bs->node_name,
   4779                                 qstring_get_str(qobject_to(QString, new)))) {
   4780                         continue; /* Found child with this name, skip option */
   4781                     }
   4782                 }
   4783             }
   4784 
   4785             /*
   4786              * TODO: When using -drive to specify blockdev options, all values
   4787              * will be strings; however, when using -blockdev, blockdev-add or
   4788              * filenames using the json:{} pseudo-protocol, they will be
   4789              * correctly typed.
   4790              * In contrast, reopening options are (currently) always strings
   4791              * (because you can only specify them through qemu-io; all other
   4792              * callers do not specify any options).
   4793              * Therefore, when using anything other than -drive to create a BDS,
   4794              * this cannot detect non-string options as unchanged, because
   4795              * qobject_is_equal() always returns false for objects of different
   4796              * type.  In the future, this should be remedied by correctly typing
   4797              * all options.  For now, this is not too big of an issue because
   4798              * the user can simply omit options which cannot be changed anyway,
   4799              * so they will stay unchanged.
   4800              */
   4801             if (!qobject_is_equal(new, old)) {
   4802                 error_setg(errp, "Cannot change the option '%s'", entry->key);
   4803                 ret = -EINVAL;
   4804                 goto error;
   4805             }
   4806         } while ((entry = qdict_next(reopen_state->options, entry)));
   4807     }
   4808 
   4809     ret = 0;
   4810 
   4811     /* Restore the original reopen_state->options QDict */
   4812     qobject_unref(reopen_state->options);
   4813     reopen_state->options = qobject_ref(orig_reopen_opts);
   4814 
   4815 error:
   4816     if (ret < 0 && drv_prepared) {
   4817         /* drv->bdrv_reopen_prepare() has succeeded, so we need to
   4818          * call drv->bdrv_reopen_abort() before signaling an error
   4819          * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
   4820          * when the respective bdrv_reopen_prepare() has failed) */
   4821         if (drv->bdrv_reopen_abort) {
   4822             drv->bdrv_reopen_abort(reopen_state);
   4823         }
   4824     }
   4825     qemu_opts_del(opts);
   4826     qobject_unref(orig_reopen_opts);
   4827     g_free(discard);
   4828     return ret;
   4829 }
   4830 
   4831 /*
   4832  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
   4833  * makes them final by swapping the staging BlockDriverState contents into
   4834  * the active BlockDriverState contents.
   4835  */
   4836 static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
   4837 {
   4838     BlockDriver *drv;
   4839     BlockDriverState *bs;
   4840     BdrvChild *child;
   4841 
   4842     assert(reopen_state != NULL);
   4843     bs = reopen_state->bs;
   4844     drv = bs->drv;
   4845     assert(drv != NULL);
   4846     GLOBAL_STATE_CODE();
   4847 
   4848     /* If there are any driver level actions to take */
   4849     if (drv->bdrv_reopen_commit) {
   4850         drv->bdrv_reopen_commit(reopen_state);
   4851     }
   4852 
   4853     /* Swap in the new option QDicts (with our own references) and flags */
   4854     qobject_unref(bs->explicit_options);
   4855     qobject_unref(bs->options);
   4856     qobject_ref(reopen_state->explicit_options);
   4857     qobject_ref(reopen_state->options);
   4858 
   4859     bs->explicit_options   = reopen_state->explicit_options;
   4860     bs->options            = reopen_state->options;
   4861     bs->open_flags         = reopen_state->flags;
   4862     bs->detect_zeroes      = reopen_state->detect_zeroes;
   4863 
   4864     /* Remove child references from bs->options and bs->explicit_options.
   4865      * Child options were already removed in bdrv_reopen_queue_child() */
   4866     QLIST_FOREACH(child, &bs->children, next) {
   4867         qdict_del(bs->explicit_options, child->name);
   4868         qdict_del(bs->options, child->name);
   4869     }
   4870     /* backing may have been removed, so the loop above would not cover it */
   4871     qdict_del(bs->explicit_options, "backing");
   4872     qdict_del(bs->options, "backing");
   4873 
   4874     bdrv_refresh_limits(bs, NULL, NULL);
   4875 }
   4876 
   4877 /*
   4878  * Abort the reopen: the driver's bdrv_reopen_abort() callback, if it has one,
   4879  * is called to discard the staged changes in reopen_state
   4880  */
   4881 static void bdrv_reopen_abort(BDRVReopenState *reopen_state)
   4882 {
   4883     BlockDriver *drv;
   4884 
   4885     assert(reopen_state != NULL);
   4886     drv = reopen_state->bs->drv;
   4887     assert(drv != NULL);
   4888     GLOBAL_STATE_CODE();
   4889 
   4890     if (drv->bdrv_reopen_abort) {
   4891         drv->bdrv_reopen_abort(reopen_state);
   4892     }
   4893 }
   4894 
   4895 /* Closes bs (refcnt must be 0): driver, children and node state are dropped. */
   4896 static void bdrv_close(BlockDriverState *bs)
   4897 {
   4898     BdrvAioNotifier *ban, *ban_next;
   4899     BdrvChild *child, *next;
   4900 
   4901     GLOBAL_STATE_CODE();
   4902     assert(!bs->refcnt);
   4903 
   4904     bdrv_drained_begin(bs); /* complete I/O */
   4905     bdrv_flush(bs);
   4906     bdrv_drain(bs); /* in case flush left pending I/O */
   4907 
   4908     if (bs->drv) {
   4909         if (bs->drv->bdrv_close) {
   4910             /* Must unfreeze all children, so bdrv_unref_child() works */
   4911             bs->drv->bdrv_close(bs);
   4912         }
   4913         bs->drv = NULL;
   4914     }
   4915 
   4916     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
   4917         bdrv_unref_child(bs, child);
   4918     }
   4919 
   4920     assert(!bs->backing);
   4921     assert(!bs->file);
   4922     g_free(bs->opaque);
   4923     bs->opaque = NULL;
   4924     qatomic_set(&bs->copy_on_read, 0);
   4925     bs->backing_file[0] = '\0';
   4926     bs->backing_format[0] = '\0';
   4927     bs->total_sectors = 0;
   4928     bs->encrypted = false;
   4929     bs->sg = false;
   4930     qobject_unref(bs->options);
   4931     qobject_unref(bs->explicit_options);
   4932     bs->options = NULL;
   4933     bs->explicit_options = NULL;
   4934     qobject_unref(bs->full_open_options);
   4935     bs->full_open_options = NULL;
   4936     g_free(bs->block_status_cache);
   4937     bs->block_status_cache = NULL;
   4938 
   4939     bdrv_release_named_dirty_bitmaps(bs);
   4940     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
   4941 
   4942     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
   4943         g_free(ban);
   4944     }
   4945     QLIST_INIT(&bs->aio_notifiers);
   4946     bdrv_drained_end(bs);
   4947 
   4948     /*
   4949      * If we're still inside some bdrv_drain_all_begin()/end() sections, end
   4950      * them now since this BDS won't exist anymore when bdrv_drain_all_end()
   4951      * gets called.
   4952      */
   4953     if (bs->quiesce_counter) {
   4954         bdrv_drain_all_end_quiesce(bs);
   4955     }
   4956 }
   4957 
/*
 * Close all block nodes.
 *
 * No job may exist anymore when this is called (asserted), and once the
 * function is done the global list of BlockDriverStates must be empty
 * (asserted as well).
 */
void bdrv_close_all(void)
{
    GLOBAL_STATE_CODE();
    assert(job_next(NULL) == NULL);

    /* Drop references from requests still in flight, such as canceled block
     * jobs whose AIO context has not been polled yet */
    bdrv_drain_all();

    /* Detach the nodes from their BlockBackends, then close the rest */
    blk_remove_all_bs();
    blockdev_close_all_bdrv_states();

    assert(QTAILQ_EMPTY(&all_bdrv_states));
}
   4972 
   4973 static bool should_update_child(BdrvChild *c, BlockDriverState *to)
   4974 {
   4975     GQueue *queue;
   4976     GHashTable *found;
   4977     bool ret;
   4978 
   4979     if (c->klass->stay_at_node) {
   4980         return false;
   4981     }
   4982 
   4983     /* If the child @c belongs to the BDS @to, replacing the current
   4984      * c->bs by @to would mean to create a loop.
   4985      *
   4986      * Such a case occurs when appending a BDS to a backing chain.
   4987      * For instance, imagine the following chain:
   4988      *
   4989      *   guest device -> node A -> further backing chain...
   4990      *
   4991      * Now we create a new BDS B which we want to put on top of this
   4992      * chain, so we first attach A as its backing node:
   4993      *
   4994      *                   node B
   4995      *                     |
   4996      *                     v
   4997      *   guest device -> node A -> further backing chain...
   4998      *
   4999      * Finally we want to replace A by B.  When doing that, we want to
   5000      * replace all pointers to A by pointers to B -- except for the
   5001      * pointer from B because (1) that would create a loop, and (2)
   5002      * that pointer should simply stay intact:
   5003      *
   5004      *   guest device -> node B
   5005      *                     |
   5006      *                     v
   5007      *                   node A -> further backing chain...
   5008      *
   5009      * In general, when replacing a node A (c->bs) by a node B (@to),
   5010      * if A is a child of B, that means we cannot replace A by B there
   5011      * because that would create a loop.  Silently detaching A from B
   5012      * is also not really an option.  So overall just leaving A in
   5013      * place there is the most sensible choice.
   5014      *
   5015      * We would also create a loop in any cases where @c is only
   5016      * indirectly referenced by @to. Prevent this by returning false
   5017      * if @c is found (by breadth-first search) anywhere in the whole
   5018      * subtree of @to.
   5019      */
   5020 
   5021     ret = true;
   5022     found = g_hash_table_new(NULL, NULL);
   5023     g_hash_table_add(found, to);
   5024     queue = g_queue_new();
   5025     g_queue_push_tail(queue, to);
   5026 
   5027     while (!g_queue_is_empty(queue)) {
   5028         BlockDriverState *v = g_queue_pop_head(queue);
   5029         BdrvChild *c2;
   5030 
   5031         QLIST_FOREACH(c2, &v->children, next) {
   5032             if (c2 == c) {
   5033                 ret = false;
   5034                 break;
   5035             }
   5036 
   5037             if (g_hash_table_contains(found, c2->bs)) {
   5038                 continue;
   5039             }
   5040 
   5041             g_queue_push_tail(queue, c2->bs);
   5042             g_hash_table_add(found, c2->bs);
   5043         }
   5044     }
   5045 
   5046     g_queue_free(queue);
   5047     g_hash_table_destroy(found);
   5048 
   5049     return ret;
   5050 }
   5051 
/*
 * Commit handler of bdrv_remove_child_drv: frees the object passed as
 * @opaque through bdrv_child_free().
 */
static void bdrv_remove_child_commit(void *opaque)
{
    GLOBAL_STATE_CODE();
    bdrv_child_free(opaque);
}
   5057 
/*
 * Transaction action used for child removal.  Only a commit handler is set;
 * no other handler is installed here.
 */
static TransactionActionDrv bdrv_remove_child_drv = {
    .commit = bdrv_remove_child_commit,
};
   5061 
   5062 /* Function doesn't update permissions, caller is responsible for this. */
   5063 static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
   5064 {
   5065     if (!child) {
   5066         return;
   5067     }
   5068 
   5069     if (child->bs) {
   5070         bdrv_replace_child_tran(child, NULL, tran);
   5071     }
   5072 
   5073     tran_add(tran, &bdrv_remove_child_drv, child);
   5074 }
   5075 
   5076 /*
   5077  * A function to remove backing-chain child of @bs if exists: cow child for
   5078  * format nodes (always .backing) and filter child for filters (may be .file or
   5079  * .backing)
   5080  */
   5081 static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
   5082                                             Transaction *tran)
   5083 {
   5084     bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran);
   5085 }
   5086 
   5087 static int bdrv_replace_node_noperm(BlockDriverState *from,
   5088                                     BlockDriverState *to,
   5089                                     bool auto_skip, Transaction *tran,
   5090                                     Error **errp)
   5091 {
   5092     BdrvChild *c, *next;
   5093 
   5094     GLOBAL_STATE_CODE();
   5095 
   5096     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
   5097         assert(c->bs == from);
   5098         if (!should_update_child(c, to)) {
   5099             if (auto_skip) {
   5100                 continue;
   5101             }
   5102             error_setg(errp, "Should not change '%s' link to '%s'",
   5103                        c->name, from->node_name);
   5104             return -EINVAL;
   5105         }
   5106         if (c->frozen) {
   5107             error_setg(errp, "Cannot change '%s' link to '%s'",
   5108                        c->name, from->node_name);
   5109             return -EPERM;
   5110         }
   5111         bdrv_replace_child_tran(c, to, tran);
   5112     }
   5113 
   5114     return 0;
   5115 }
   5116 
/*
 * Replace @from by @to for the parents of @from, then refresh permissions.
 * All graph changes are collected in one transaction that is finalized with
 * the resulting return value.
 *
 * With auto_skip=true bdrv_replace_node_common skips updating those parents
 * of @from for which the update would create a parent-child relation loop or
 * which are block jobs.
 *
 * With auto_skip=false an error is returned if @from has a parent which
 * should not be updated.
 *
 * With @detach_subchain=true @to must be in a backing chain of @from. In this
 * case backing link of the cow-parent of @to is removed.
 *
 * Returns 0 on success, a negative value on failure (with @errp set by the
 * failing step).
 */
static int bdrv_replace_node_common(BlockDriverState *from,
                                    BlockDriverState *to,
                                    bool auto_skip, bool detach_subchain,
                                    Error **errp)
{
    Transaction *tran = tran_new();
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;
    BlockDriverState *to_cow_parent = NULL;
    int ret;

    GLOBAL_STATE_CODE();

    if (detach_subchain) {
        assert(bdrv_chain_contains(from, to));
        assert(from != to);
        /*
         * Walk down from @from until we reach the node whose filter-or-COW
         * child is @to; the loop body is intentionally empty.
         */
        for (to_cow_parent = from;
             bdrv_filter_or_cow_bs(to_cow_parent) != to;
             to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
        {
            ;
        }
    }

    /* Make sure that @from doesn't go away until we have successfully attached
     * all of its parents to @to. */
    bdrv_ref(from);

    /* Must run in the main AioContext, with both nodes in the same context */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
    bdrv_drained_begin(from);

    /*
     * Do the replacement without permission update.
     * Replacement may influence the permissions, we should calculate new
     * permissions based on new graph. If we fail, we'll roll-back the
     * replacement.
     */
    ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
    if (ret < 0) {
        goto out;
    }

    if (detach_subchain) {
        bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
    }

    found = g_hash_table_new(NULL, NULL);

    /* Collect the nodes reachable from @to and @from for the refresh */
    refresh_list = bdrv_topological_dfs(refresh_list, found, to);
    refresh_list = bdrv_topological_dfs(refresh_list, found, from);

    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
    if (ret < 0) {
        goto out;
    }

    ret = 0;

out:
    /* Finalize the transaction according to @ret */
    tran_finalize(tran, ret);

    bdrv_drained_end(from);
    bdrv_unref(from);

    return ret;
}
   5194 
/*
 * Replace @from by @to via bdrv_replace_node_common() with auto_skip=true
 * and detach_subchain=false.  Returns that function's result.
 */
int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
                      Error **errp)
{
    GLOBAL_STATE_CODE();

    return bdrv_replace_node_common(from, to, true, false, errp);
}
   5202 
/*
 * Replace @bs by its filter-or-COW child via bdrv_replace_node_common(),
 * with auto_skip=true and detach_subchain=true.  Returns that function's
 * result.
 */
int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
{
    GLOBAL_STATE_CODE();

    return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
                                    errp);
}
   5210 
/*
 * Add @bs_new at the top of an image chain while the chain is live:
 * @bs_top is attached as the "backing" child of @bs_new, and the parents of
 * @bs_top are then switched over to @bs_new (parents that must not be
 * changed are skipped).
 *
 * bs_new must not be attached to a BlockBackend and must not have backing
 * child.
 *
 * This function does not create any image files.
 *
 * Returns 0 on success, a negative value on failure with @errp set.
 */
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
                Error **errp)
{
    int ret;
    BdrvChild *child;
    Transaction *tran = tran_new();

    GLOBAL_STATE_CODE();

    assert(!bs_new->backing);

    /* Step 1: make @bs_top the backing child of @bs_new */
    child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
                                     &child_of_bds, bdrv_backing_role(bs_new),
                                     tran, errp);
    if (!child) {
        ret = -EINVAL;
        goto out;
    }

    /* Step 2: move the parents of @bs_top over to @bs_new (auto_skip=true) */
    ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
    if (ret < 0) {
        goto out;
    }

    /* Step 3: recalculate permissions for the new graph */
    ret = bdrv_refresh_perms(bs_new, errp);
out:
    tran_finalize(tran, ret);

    /* Note: executed on the failure paths as well */
    bdrv_refresh_limits(bs_top, NULL, NULL);

    return ret;
}
   5255 
/*
 * Make @child point to @new_bs instead of its current node and refresh the
 * permissions of both the old and the new node's subgraph.
 *
 * Not for empty child: child->bs is referenced and drained unconditionally.
 *
 * Returns the result of the permission refresh; the transaction holding the
 * replacement is finalized with that value.
 */
int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
                          Error **errp)
{
    int ret;
    Transaction *tran = tran_new();
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;
    BlockDriverState *old_bs = child->bs;

    GLOBAL_STATE_CODE();

    /* Hold a reference to the old node until the end of this function */
    bdrv_ref(old_bs);
    bdrv_drained_begin(old_bs);
    bdrv_drained_begin(new_bs);

    bdrv_replace_child_tran(child, new_bs, tran);

    found = g_hash_table_new(NULL, NULL);
    refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs);
    refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs);

    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);

    tran_finalize(tran, ret);

    bdrv_drained_end(old_bs);
    bdrv_drained_end(new_bs);
    bdrv_unref(old_bs);

    return ret;
}
   5288 
/*
 * Destroy @bs: take it off the global lists, close it and free the
 * structure.  @bs must have no op blockers and no references left (both
 * asserted).
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    GLOBAL_STATE_CODE();

    /* remove from list, if necessary */
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);

    bdrv_close(bs);

    g_free(bs);
}
   5305 
   5306 
   5307 /*
   5308  * Replace @bs by newly created block node.
   5309  *
   5310  * @options is a QDict of options to pass to the block drivers, or NULL for an
   5311  * empty set of options. The reference to the QDict belongs to the block layer
   5312  * after the call (even on failure), so if the caller intends to reuse the
   5313  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
   5314  */
   5315 BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
   5316                                    int flags, Error **errp)
   5317 {
   5318     ERRP_GUARD();
   5319     int ret;
   5320     BlockDriverState *new_node_bs = NULL;
   5321     const char *drvname, *node_name;
   5322     BlockDriver *drv;
   5323 
   5324     drvname = qdict_get_try_str(options, "driver");
   5325     if (!drvname) {
   5326         error_setg(errp, "driver is not specified");
   5327         goto fail;
   5328     }
   5329 
   5330     drv = bdrv_find_format(drvname);
   5331     if (!drv) {
   5332         error_setg(errp, "Unknown driver: '%s'", drvname);
   5333         goto fail;
   5334     }
   5335 
   5336     node_name = qdict_get_try_str(options, "node-name");
   5337 
   5338     GLOBAL_STATE_CODE();
   5339 
   5340     new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
   5341                                             errp);
   5342     options = NULL; /* bdrv_new_open_driver() eats options */
   5343     if (!new_node_bs) {
   5344         error_prepend(errp, "Could not create node: ");
   5345         goto fail;
   5346     }
   5347 
   5348     bdrv_drained_begin(bs);
   5349     ret = bdrv_replace_node(bs, new_node_bs, errp);
   5350     bdrv_drained_end(bs);
   5351 
   5352     if (ret < 0) {
   5353         error_prepend(errp, "Could not replace node: ");
   5354         goto fail;
   5355     }
   5356 
   5357     return new_node_bs;
   5358 
   5359 fail:
   5360     qobject_unref(options);
   5361     bdrv_unref(new_node_bs);
   5362     return NULL;
   5363 }
   5364 
   5365 /*
   5366  * Run consistency checks on an image
   5367  *
   5368  * Returns 0 if the check could be completed (it doesn't mean that the image is
   5369  * free of errors) or -errno when an internal error occurred. The results of the
   5370  * check are stored in res.
   5371  */
   5372 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
   5373                                BdrvCheckResult *res, BdrvCheckMode fix)
   5374 {
   5375     IO_CODE();
   5376     if (bs->drv == NULL) {
   5377         return -ENOMEDIUM;
   5378     }
   5379     if (bs->drv->bdrv_co_check == NULL) {
   5380         return -ENOTSUP;
   5381     }
   5382 
   5383     memset(res, 0, sizeof(*res));
   5384     return bs->drv->bdrv_co_check(bs, res, fix);
   5385 }
   5386 
   5387 /*
   5388  * Return values:
   5389  * 0        - success
   5390  * -EINVAL  - backing format specified, but no file
   5391  * -ENOSPC  - can't update the backing file because no space is left in the
   5392  *            image file header
   5393  * -ENOTSUP - format driver doesn't support changing the backing file
   5394  */
   5395 int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
   5396                              const char *backing_fmt, bool require)
   5397 {
   5398     BlockDriver *drv = bs->drv;
   5399     int ret;
   5400 
   5401     GLOBAL_STATE_CODE();
   5402 
   5403     if (!drv) {
   5404         return -ENOMEDIUM;
   5405     }
   5406 
   5407     /* Backing file format doesn't make sense without a backing file */
   5408     if (backing_fmt && !backing_file) {
   5409         return -EINVAL;
   5410     }
   5411 
   5412     if (require && backing_file && !backing_fmt) {
   5413         return -EINVAL;
   5414     }
   5415 
   5416     if (drv->bdrv_change_backing_file != NULL) {
   5417         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
   5418     } else {
   5419         ret = -ENOTSUP;
   5420     }
   5421 
   5422     if (ret == 0) {
   5423         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
   5424         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
   5425         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   5426                 backing_file ?: "");
   5427     }
   5428     return ret;
   5429 }
   5430 
   5431 /*
   5432  * Finds the first non-filter node above bs in the chain between
   5433  * active and bs.  The returned node is either an immediate parent of
   5434  * bs, or there are only filter nodes between the two.
   5435  *
   5436  * Returns NULL if bs is not found in active's image chain,
   5437  * or if active == bs.
   5438  *
   5439  * Returns the bottommost base image if bs == NULL.
   5440  */
   5441 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
   5442                                     BlockDriverState *bs)
   5443 {
   5444 
   5445     GLOBAL_STATE_CODE();
   5446 
   5447     bs = bdrv_skip_filters(bs);
   5448     active = bdrv_skip_filters(active);
   5449 
   5450     while (active) {
   5451         BlockDriverState *next = bdrv_backing_chain_next(active);
   5452         if (bs == next) {
   5453             return active;
   5454         }
   5455         active = next;
   5456     }
   5457 
   5458     return NULL;
   5459 }
   5460 
/*
 * Given a BDS, searches for the base layer.  Implemented as
 * bdrv_find_overlay(bs, NULL).
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    GLOBAL_STATE_CODE();

    return bdrv_find_overlay(bs, NULL);
}
   5468 
   5469 /*
   5470  * Return true if at least one of the COW (backing) and filter links
   5471  * between @bs and @base is frozen. @errp is set if that's the case.
   5472  * @base must be reachable from @bs, or NULL.
   5473  */
   5474 bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
   5475                                   Error **errp)
   5476 {
   5477     BlockDriverState *i;
   5478     BdrvChild *child;
   5479 
   5480     GLOBAL_STATE_CODE();
   5481 
   5482     for (i = bs; i != base; i = child_bs(child)) {
   5483         child = bdrv_filter_or_cow_child(i);
   5484 
   5485         if (child && child->frozen) {
   5486             error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
   5487                        child->name, i->node_name, child->bs->node_name);
   5488             return true;
   5489         }
   5490     }
   5491 
   5492     return false;
   5493 }
   5494 
   5495 /*
   5496  * Freeze all COW (backing) and filter links between @bs and @base.
   5497  * If any of the links is already frozen the operation is aborted and
   5498  * none of the links are modified.
   5499  * @base must be reachable from @bs, or NULL.
   5500  * Returns 0 on success. On failure returns < 0 and sets @errp.
   5501  */
   5502 int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
   5503                               Error **errp)
   5504 {
   5505     BlockDriverState *i;
   5506     BdrvChild *child;
   5507 
   5508     GLOBAL_STATE_CODE();
   5509 
   5510     if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
   5511         return -EPERM;
   5512     }
   5513 
   5514     for (i = bs; i != base; i = child_bs(child)) {
   5515         child = bdrv_filter_or_cow_child(i);
   5516         if (child && child->bs->never_freeze) {
   5517             error_setg(errp, "Cannot freeze '%s' link to '%s'",
   5518                        child->name, child->bs->node_name);
   5519             return -EPERM;
   5520         }
   5521     }
   5522 
   5523     for (i = bs; i != base; i = child_bs(child)) {
   5524         child = bdrv_filter_or_cow_child(i);
   5525         if (child) {
   5526             child->frozen = true;
   5527         }
   5528     }
   5529 
   5530     return 0;
   5531 }
   5532 
   5533 /*
   5534  * Unfreeze all COW (backing) and filter links between @bs and @base.
   5535  * The caller must ensure that all links are frozen before using this
   5536  * function.
   5537  * @base must be reachable from @bs, or NULL.
   5538  */
   5539 void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
   5540 {
   5541     BlockDriverState *i;
   5542     BdrvChild *child;
   5543 
   5544     GLOBAL_STATE_CODE();
   5545 
   5546     for (i = bs; i != base; i = child_bs(child)) {
   5547         child = bdrv_filter_or_cow_child(i);
   5548         if (child) {
   5549             assert(child->frozen);
   5550             child->frozen = false;
   5551         }
   5552     }
   5553 }
   5554 
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in that overlay can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.  Otherwise the (refreshed) filename of 'base' is
 * used.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success.  Returns -EIO if 'top' or 'base' has no driver, if
 * 'base' is not in the chain of 'top', or if replacing the node fails; a
 * failing update_filename callback makes the function return that
 * callback's negative result.  Errors are reported, not passed to the caller.
 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    GLOBAL_STATE_CODE();

    /* Keep 'top' alive and its subtree drained until the exit label */
    bdrv_ref(top);
    bdrv_subtree_drained_begin(top);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    if (!backing_file_str) {
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /*
     * Remember the parent links of 'top' before the replacement below, so
     * that their update_filename callbacks can be run afterwards.
     */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    /* The return value is not used; failure is detected through local_err */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    if (local_err) {
        error_report_err(local_err);
        goto exit;
    }

    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
exit:
    bdrv_subtree_drained_end(top);
    bdrv_unref(top);
    return ret;
}
   5675 
   5676 /**
   5677  * Implementation of BlockDriver.bdrv_get_allocated_file_size() that
   5678  * sums the size of all data-bearing children.  (This excludes backing
   5679  * children.)
   5680  */
   5681 static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs)
   5682 {
   5683     BdrvChild *child;
   5684     int64_t child_size, sum = 0;
   5685 
   5686     QLIST_FOREACH(child, &bs->children, next) {
   5687         if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
   5688                            BDRV_CHILD_FILTERED))
   5689         {
   5690             child_size = bdrv_get_allocated_file_size(child->bs);
   5691             if (child_size < 0) {
   5692                 return child_size;
   5693             }
   5694             sum += child_size;
   5695         }
   5696     }
   5697 
   5698     return sum;
   5699 }
   5700 
   5701 /**
   5702  * Length of a allocated file in bytes. Sparse files are counted by actual
   5703  * allocated space. Return < 0 if error or unknown.
   5704  */
   5705 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
   5706 {
   5707     BlockDriver *drv = bs->drv;
   5708     IO_CODE();
   5709 
   5710     if (!drv) {
   5711         return -ENOMEDIUM;
   5712     }
   5713     if (drv->bdrv_get_allocated_file_size) {
   5714         return drv->bdrv_get_allocated_file_size(bs);
   5715     }
   5716 
   5717     if (drv->bdrv_file_open) {
   5718         /*
   5719          * Protocol drivers default to -ENOTSUP (most of their data is
   5720          * not stored in any of their children (if they even have any),
   5721          * so there is no generic way to figure it out).
   5722          */
   5723         return -ENOTSUP;
   5724     } else if (drv->is_filter) {
   5725         /* Filter drivers default to the size of their filtered child */
   5726         return bdrv_get_allocated_file_size(bdrv_filter_bs(bs));
   5727     } else {
   5728         /* Other drivers default to summing their children's sizes */
   5729         return bdrv_sum_allocated_file_size(bs);
   5730     }
   5731 }
   5732 
   5733 /*
   5734  * bdrv_measure:
   5735  * @drv: Format driver
   5736  * @opts: Creation options for new image
   5737  * @in_bs: Existing image containing data for new image (may be NULL)
   5738  * @errp: Error object
   5739  * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
   5740  *          or NULL on error
   5741  *
   5742  * Calculate file size required to create a new image.
   5743  *
   5744  * If @in_bs is given then space for allocated clusters and zero clusters
   5745  * from that image are included in the calculation.  If @opts contains a
   5746  * backing file that is shared by @in_bs then backing clusters may be omitted
   5747  * from the calculation.
   5748  *
   5749  * If @in_bs is NULL then the calculation includes no allocated clusters
   5750  * unless a preallocation option is given in @opts.
   5751  *
   5752  * Note that @in_bs may use a different BlockDriver from @drv.
   5753  *
   5754  * If an error occurs the @errp pointer is set.
   5755  */
   5756 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
   5757                                BlockDriverState *in_bs, Error **errp)
   5758 {
   5759     IO_CODE();
   5760     if (!drv->bdrv_measure) {
   5761         error_setg(errp, "Block driver '%s' does not support size measurement",
   5762                    drv->format_name);
   5763         return NULL;
   5764     }
   5765 
   5766     return drv->bdrv_measure(opts, in_bs, errp);
   5767 }
   5768 
   5769 /**
   5770  * Return number of sectors on success, -errno on error.
   5771  */
   5772 int64_t bdrv_nb_sectors(BlockDriverState *bs)
   5773 {
   5774     BlockDriver *drv = bs->drv;
   5775     IO_CODE();
   5776 
   5777     if (!drv)
   5778         return -ENOMEDIUM;
   5779 
   5780     if (drv->has_variable_length) {
   5781         int ret = refresh_total_sectors(bs, bs->total_sectors);
   5782         if (ret < 0) {
   5783             return ret;
   5784         }
   5785     }
   5786     return bs->total_sectors;
   5787 }
   5788 
   5789 /**
   5790  * Return length in bytes on success, -errno on error.
   5791  * The length is always a multiple of BDRV_SECTOR_SIZE.
   5792  */
   5793 int64_t bdrv_getlength(BlockDriverState *bs)
   5794 {
   5795     int64_t ret = bdrv_nb_sectors(bs);
   5796     IO_CODE();
   5797 
   5798     if (ret < 0) {
   5799         return ret;
   5800     }
   5801     if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
   5802         return -EFBIG;
   5803     }
   5804     return ret * BDRV_SECTOR_SIZE;
   5805 }
   5806 
   5807 /* return 0 as number of sectors if no device present or error */
   5808 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
   5809 {
   5810     int64_t nb_sectors = bdrv_nb_sectors(bs);
   5811     IO_CODE();
   5812 
   5813     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
   5814 }
   5815 
   5816 bool bdrv_is_sg(BlockDriverState *bs)
   5817 {
   5818     IO_CODE();
   5819     return bs->sg;
   5820 }
   5821 
   5822 /**
   5823  * Return whether the given node supports compressed writes.
   5824  */
   5825 bool bdrv_supports_compressed_writes(BlockDriverState *bs)
   5826 {
   5827     BlockDriverState *filtered;
   5828     IO_CODE();
   5829 
   5830     if (!bs->drv || !block_driver_can_compress(bs->drv)) {
   5831         return false;
   5832     }
   5833 
   5834     filtered = bdrv_filter_bs(bs);
   5835     if (filtered) {
   5836         /*
   5837          * Filters can only forward compressed writes, so we have to
   5838          * check the child.
   5839          */
   5840         return bdrv_supports_compressed_writes(filtered);
   5841     }
   5842 
   5843     return true;
   5844 }
   5845 
   5846 const char *bdrv_get_format_name(BlockDriverState *bs)
   5847 {
   5848     IO_CODE();
   5849     return bs->drv ? bs->drv->format_name : NULL;
   5850 }
   5851 
   5852 static int qsort_strcmp(const void *a, const void *b)
   5853 {
   5854     return strcmp(*(char *const *)a, *(char *const *)b);
   5855 }
   5856 
   5857 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
   5858                          void *opaque, bool read_only)
   5859 {
   5860     BlockDriver *drv;
   5861     int count = 0;
   5862     int i;
   5863     const char **formats = NULL;
   5864 
   5865     GLOBAL_STATE_CODE();
   5866 
   5867     QLIST_FOREACH(drv, &bdrv_drivers, list) {
   5868         if (drv->format_name) {
   5869             bool found = false;
   5870             int i = count;
   5871 
   5872             if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
   5873                 continue;
   5874             }
   5875 
   5876             while (formats && i && !found) {
   5877                 found = !strcmp(formats[--i], drv->format_name);
   5878             }
   5879 
   5880             if (!found) {
   5881                 formats = g_renew(const char *, formats, count + 1);
   5882                 formats[count++] = drv->format_name;
   5883             }
   5884         }
   5885     }
   5886 
   5887     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
   5888         const char *format_name = block_driver_modules[i].format_name;
   5889 
   5890         if (format_name) {
   5891             bool found = false;
   5892             int j = count;
   5893 
   5894             if (use_bdrv_whitelist &&
   5895                 !bdrv_format_is_whitelisted(format_name, read_only)) {
   5896                 continue;
   5897             }
   5898 
   5899             while (formats && j && !found) {
   5900                 found = !strcmp(formats[--j], format_name);
   5901             }
   5902 
   5903             if (!found) {
   5904                 formats = g_renew(const char *, formats, count + 1);
   5905                 formats[count++] = format_name;
   5906             }
   5907         }
   5908     }
   5909 
   5910     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
   5911 
   5912     for (i = 0; i < count; i++) {
   5913         it(opaque, formats[i]);
   5914     }
   5915 
   5916     g_free(formats);
   5917 }
   5918 
   5919 /* This function is to find a node in the bs graph */
   5920 BlockDriverState *bdrv_find_node(const char *node_name)
   5921 {
   5922     BlockDriverState *bs;
   5923 
   5924     assert(node_name);
   5925     GLOBAL_STATE_CODE();
   5926 
   5927     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   5928         if (!strcmp(node_name, bs->node_name)) {
   5929             return bs;
   5930         }
   5931     }
   5932     return NULL;
   5933 }
   5934 
   5935 /* Put this QMP function here so it can access the static graph_bdrv_states. */
   5936 BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
   5937                                            Error **errp)
   5938 {
   5939     BlockDeviceInfoList *list;
   5940     BlockDriverState *bs;
   5941 
   5942     GLOBAL_STATE_CODE();
   5943 
   5944     list = NULL;
   5945     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   5946         BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
   5947         if (!info) {
   5948             qapi_free_BlockDeviceInfoList(list);
   5949             return NULL;
   5950         }
   5951         QAPI_LIST_PREPEND(list, info);
   5952     }
   5953 
   5954     return list;
   5955 }
   5956 
/* Working state used while an XDbgBlockGraph is being assembled. */
typedef struct XDbgBlockGraphConstructor {
    /* Graph under construction; returned by xdbg_graph_finalize() */
    XDbgBlockGraph *graph;
    /* Maps a node pointer to the number handed out by xdbg_graph_node_num() */
    GHashTable *graph_nodes;
} XDbgBlockGraphConstructor;
   5961 
   5962 static XDbgBlockGraphConstructor *xdbg_graph_new(void)
   5963 {
   5964     XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
   5965 
   5966     gr->graph = g_new0(XDbgBlockGraph, 1);
   5967     gr->graph_nodes = g_hash_table_new(NULL, NULL);
   5968 
   5969     return gr;
   5970 }
   5971 
   5972 static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
   5973 {
   5974     XDbgBlockGraph *graph = gr->graph;
   5975 
   5976     g_hash_table_destroy(gr->graph_nodes);
   5977     g_free(gr);
   5978 
   5979     return graph;
   5980 }
   5981 
   5982 static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
   5983 {
   5984     uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
   5985 
   5986     if (ret != 0) {
   5987         return ret;
   5988     }
   5989 
   5990     /*
   5991      * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
   5992      * answer of g_hash_table_lookup.
   5993      */
   5994     ret = g_hash_table_size(gr->graph_nodes) + 1;
   5995     g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
   5996 
   5997     return ret;
   5998 }
   5999 
   6000 static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
   6001                                 XDbgBlockGraphNodeType type, const char *name)
   6002 {
   6003     XDbgBlockGraphNode *n;
   6004 
   6005     n = g_new0(XDbgBlockGraphNode, 1);
   6006 
   6007     n->id = xdbg_graph_node_num(gr, node);
   6008     n->type = type;
   6009     n->name = g_strdup(name);
   6010 
   6011     QAPI_LIST_PREPEND(gr->graph->nodes, n);
   6012 }
   6013 
   6014 static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
   6015                                 const BdrvChild *child)
   6016 {
   6017     BlockPermission qapi_perm;
   6018     XDbgBlockGraphEdge *edge;
   6019     GLOBAL_STATE_CODE();
   6020 
   6021     edge = g_new0(XDbgBlockGraphEdge, 1);
   6022 
   6023     edge->parent = xdbg_graph_node_num(gr, parent);
   6024     edge->child = xdbg_graph_node_num(gr, child->bs);
   6025     edge->name = g_strdup(child->name);
   6026 
   6027     for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
   6028         uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
   6029 
   6030         if (flag & child->perm) {
   6031             QAPI_LIST_PREPEND(edge->perm, qapi_perm);
   6032         }
   6033         if (flag & child->shared_perm) {
   6034             QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
   6035         }
   6036     }
   6037 
   6038     QAPI_LIST_PREPEND(gr->graph->edges, edge);
   6039 }
   6040 
   6041 
   6042 XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
   6043 {
   6044     BlockBackend *blk;
   6045     BlockJob *job;
   6046     BlockDriverState *bs;
   6047     BdrvChild *child;
   6048     XDbgBlockGraphConstructor *gr = xdbg_graph_new();
   6049 
   6050     GLOBAL_STATE_CODE();
   6051 
   6052     for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
   6053         char *allocated_name = NULL;
   6054         const char *name = blk_name(blk);
   6055 
   6056         if (!*name) {
   6057             name = allocated_name = blk_get_attached_dev_id(blk);
   6058         }
   6059         xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
   6060                            name);
   6061         g_free(allocated_name);
   6062         if (blk_root(blk)) {
   6063             xdbg_graph_add_edge(gr, blk, blk_root(blk));
   6064         }
   6065     }
   6066 
   6067     WITH_JOB_LOCK_GUARD() {
   6068         for (job = block_job_next_locked(NULL); job;
   6069              job = block_job_next_locked(job)) {
   6070             GSList *el;
   6071 
   6072             xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
   6073                                 job->job.id);
   6074             for (el = job->nodes; el; el = el->next) {
   6075                 xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
   6076             }
   6077         }
   6078     }
   6079 
   6080     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
   6081         xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
   6082                            bs->node_name);
   6083         QLIST_FOREACH(child, &bs->children, next) {
   6084             xdbg_graph_add_edge(gr, bs, child);
   6085         }
   6086     }
   6087 
   6088     return xdbg_graph_finalize(gr);
   6089 }
   6090 
   6091 BlockDriverState *bdrv_lookup_bs(const char *device,
   6092                                  const char *node_name,
   6093                                  Error **errp)
   6094 {
   6095     BlockBackend *blk;
   6096     BlockDriverState *bs;
   6097 
   6098     GLOBAL_STATE_CODE();
   6099 
   6100     if (device) {
   6101         blk = blk_by_name(device);
   6102 
   6103         if (blk) {
   6104             bs = blk_bs(blk);
   6105             if (!bs) {
   6106                 error_setg(errp, "Device '%s' has no medium", device);
   6107             }
   6108 
   6109             return bs;
   6110         }
   6111     }
   6112 
   6113     if (node_name) {
   6114         bs = bdrv_find_node(node_name);
   6115 
   6116         if (bs) {
   6117             return bs;
   6118         }
   6119     }
   6120 
   6121     error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
   6122                      device ? device : "",
   6123                      node_name ? node_name : "");
   6124     return NULL;
   6125 }
   6126 
   6127 /* If 'base' is in the same chain as 'top', return true. Otherwise,
   6128  * return false.  If either argument is NULL, return false. */
   6129 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
   6130 {
   6131 
   6132     GLOBAL_STATE_CODE();
   6133 
   6134     while (top && top != base) {
   6135         top = bdrv_filter_or_cow_bs(top);
   6136     }
   6137 
   6138     return top != NULL;
   6139 }
   6140 
   6141 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
   6142 {
   6143     GLOBAL_STATE_CODE();
   6144     if (!bs) {
   6145         return QTAILQ_FIRST(&graph_bdrv_states);
   6146     }
   6147     return QTAILQ_NEXT(bs, node_list);
   6148 }
   6149 
   6150 BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
   6151 {
   6152     GLOBAL_STATE_CODE();
   6153     if (!bs) {
   6154         return QTAILQ_FIRST(&all_bdrv_states);
   6155     }
   6156     return QTAILQ_NEXT(bs, bs_list);
   6157 }
   6158 
   6159 const char *bdrv_get_node_name(const BlockDriverState *bs)
   6160 {
   6161     IO_CODE();
   6162     return bs->node_name;
   6163 }
   6164 
   6165 const char *bdrv_get_parent_name(const BlockDriverState *bs)
   6166 {
   6167     BdrvChild *c;
   6168     const char *name;
   6169     IO_CODE();
   6170 
   6171     /* If multiple parents have a name, just pick the first one. */
   6172     QLIST_FOREACH(c, &bs->parents, next_parent) {
   6173         if (c->klass->get_name) {
   6174             name = c->klass->get_name(c);
   6175             if (name && *name) {
   6176                 return name;
   6177             }
   6178         }
   6179     }
   6180 
   6181     return NULL;
   6182 }
   6183 
   6184 /* TODO check what callers really want: bs->node_name or blk_name() */
   6185 const char *bdrv_get_device_name(const BlockDriverState *bs)
   6186 {
   6187     IO_CODE();
   6188     return bdrv_get_parent_name(bs) ?: "";
   6189 }
   6190 
   6191 /* This can be used to identify nodes that might not have a device
   6192  * name associated. Since node and device names live in the same
   6193  * namespace, the result is unambiguous. The exception is if both are
   6194  * absent, then this returns an empty (non-null) string. */
   6195 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
   6196 {
   6197     IO_CODE();
   6198     return bdrv_get_parent_name(bs) ?: bs->node_name;
   6199 }
   6200 
   6201 int bdrv_get_flags(BlockDriverState *bs)
   6202 {
   6203     IO_CODE();
   6204     return bs->open_flags;
   6205 }
   6206 
   6207 int bdrv_has_zero_init_1(BlockDriverState *bs)
   6208 {
   6209     GLOBAL_STATE_CODE();
   6210     return 1;
   6211 }
   6212 
   6213 int bdrv_has_zero_init(BlockDriverState *bs)
   6214 {
   6215     BlockDriverState *filtered;
   6216     GLOBAL_STATE_CODE();
   6217 
   6218     if (!bs->drv) {
   6219         return 0;
   6220     }
   6221 
   6222     /* If BS is a copy on write image, it is initialized to
   6223        the contents of the base image, which may not be zeroes.  */
   6224     if (bdrv_cow_child(bs)) {
   6225         return 0;
   6226     }
   6227     if (bs->drv->bdrv_has_zero_init) {
   6228         return bs->drv->bdrv_has_zero_init(bs);
   6229     }
   6230 
   6231     filtered = bdrv_filter_bs(bs);
   6232     if (filtered) {
   6233         return bdrv_has_zero_init(filtered);
   6234     }
   6235 
   6236     /* safe default */
   6237     return 0;
   6238 }
   6239 
   6240 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
   6241 {
   6242     IO_CODE();
   6243     if (!(bs->open_flags & BDRV_O_UNMAP)) {
   6244         return false;
   6245     }
   6246 
   6247     return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
   6248 }
   6249 
   6250 void bdrv_get_backing_filename(BlockDriverState *bs,
   6251                                char *filename, int filename_size)
   6252 {
   6253     IO_CODE();
   6254     pstrcpy(filename, filename_size, bs->backing_file);
   6255 }
   6256 
   6257 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   6258 {
   6259     int ret;
   6260     BlockDriver *drv = bs->drv;
   6261     IO_CODE();
   6262     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
   6263     if (!drv) {
   6264         return -ENOMEDIUM;
   6265     }
   6266     if (!drv->bdrv_get_info) {
   6267         BlockDriverState *filtered = bdrv_filter_bs(bs);
   6268         if (filtered) {
   6269             return bdrv_get_info(filtered, bdi);
   6270         }
   6271         return -ENOTSUP;
   6272     }
   6273     memset(bdi, 0, sizeof(*bdi));
   6274     ret = drv->bdrv_get_info(bs, bdi);
   6275     if (ret < 0) {
   6276         return ret;
   6277     }
   6278 
   6279     if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
   6280         return -EINVAL;
   6281     }
   6282 
   6283     return 0;
   6284 }
   6285 
   6286 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
   6287                                           Error **errp)
   6288 {
   6289     BlockDriver *drv = bs->drv;
   6290     IO_CODE();
   6291     if (drv && drv->bdrv_get_specific_info) {
   6292         return drv->bdrv_get_specific_info(bs, errp);
   6293     }
   6294     return NULL;
   6295 }
   6296 
   6297 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
   6298 {
   6299     BlockDriver *drv = bs->drv;
   6300     IO_CODE();
   6301     if (!drv || !drv->bdrv_get_specific_stats) {
   6302         return NULL;
   6303     }
   6304     return drv->bdrv_get_specific_stats(bs);
   6305 }
   6306 
   6307 void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
   6308 {
   6309     IO_CODE();
   6310     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
   6311         return;
   6312     }
   6313 
   6314     bs->drv->bdrv_debug_event(bs, event);
   6315 }
   6316 
   6317 static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
   6318 {
   6319     GLOBAL_STATE_CODE();
   6320     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
   6321         bs = bdrv_primary_bs(bs);
   6322     }
   6323 
   6324     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
   6325         assert(bs->drv->bdrv_debug_remove_breakpoint);
   6326         return bs;
   6327     }
   6328 
   6329     return NULL;
   6330 }
   6331 
   6332 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
   6333                           const char *tag)
   6334 {
   6335     GLOBAL_STATE_CODE();
   6336     bs = bdrv_find_debug_node(bs);
   6337     if (bs) {
   6338         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
   6339     }
   6340 
   6341     return -ENOTSUP;
   6342 }
   6343 
   6344 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
   6345 {
   6346     GLOBAL_STATE_CODE();
   6347     bs = bdrv_find_debug_node(bs);
   6348     if (bs) {
   6349         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
   6350     }
   6351 
   6352     return -ENOTSUP;
   6353 }
   6354 
   6355 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
   6356 {
   6357     GLOBAL_STATE_CODE();
   6358     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
   6359         bs = bdrv_primary_bs(bs);
   6360     }
   6361 
   6362     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
   6363         return bs->drv->bdrv_debug_resume(bs, tag);
   6364     }
   6365 
   6366     return -ENOTSUP;
   6367 }
   6368 
   6369 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
   6370 {
   6371     GLOBAL_STATE_CODE();
   6372     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
   6373         bs = bdrv_primary_bs(bs);
   6374     }
   6375 
   6376     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
   6377         return bs->drv->bdrv_debug_is_suspended(bs, tag);
   6378     }
   6379 
   6380     return false;
   6381 }
   6382 
/*
 * Search the backing chain of @bs for the node that is referenced by
 * @backing_file and return it, or NULL if there is no match (or if @bs,
 * its driver, or @backing_file is NULL).
 *
 * backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain.
 */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    bool filenames_refreshed = false;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;
    BlockDriverState *bs_below;

    GLOBAL_STATE_CODE();

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Output buffers for the realpath() calls in the loop below */
    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /*
     * Being largely a legacy function, skip any filters here
     * (because filters do not have normal filenames, so they cannot
     * match anyway; and allowing json:{} filenames is a bit out of
     * scope).
     */
    for (curr_bs = bdrv_skip_filters(bs);
         bdrv_cow_child(curr_bs) != NULL;
         curr_bs = bs_below)
    {
        /* Candidate return value for this level; also the next curr_bs */
        bs_below = bdrv_backing_chain_next(curr_bs);

        if (bdrv_backing_overridden(curr_bs)) {
            /*
             * If the backing file was overridden, we can only compare
             * directly against the backing node's filename.
             */

            if (!filenames_refreshed) {
                /*
                 * This will automatically refresh all of the
                 * filenames in the rest of the backing chain, so we
                 * only need to do this once.
                 */
                bdrv_refresh_filename(bs_below);
                filenames_refreshed = true;
            }

            if (strcmp(backing_file, bs_below->filename) == 0) {
                retval = bs_below;
                break;
            }
        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            /*
             * If either of the filename paths is actually a protocol, then
             * compare unmodified paths; otherwise make paths relative.
             */
            char *backing_file_full_ret;

            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = bs_below;
                break;
            }
            /* Also check against the full backing filename for the image */
            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
                                                                   NULL);
            if (backing_file_full_ret) {
                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
                g_free(backing_file_full_ret);
                if (equal) {
                    retval = bs_below;
                    break;
                }
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
                                                       NULL);
            /* We are going to compare canonicalized absolute pathnames */
            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
                /* Cannot resolve at this level; move on to the next one */
                g_free(filename_tmp);
                continue;
            }
            g_free(filename_tmp);

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
                g_free(filename_tmp);
                continue;
            }
            g_free(filename_tmp);

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = bs_below;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    return retval;
}
   6496 
   6497 void bdrv_init(void)
   6498 {
   6499 #ifdef CONFIG_BDRV_WHITELIST_TOOLS
   6500     use_bdrv_whitelist = 1;
   6501 #endif
   6502     module_call_init(MODULE_INIT_BLOCK);
   6503 }
   6504 
   6505 void bdrv_init_with_whitelist(void)
   6506 {
   6507     use_bdrv_whitelist = 1;
   6508     bdrv_init();
   6509 }
   6510 
/*
 * Activate @bs: recurse into its children first, then, if @bs is flagged
 * BDRV_O_INACTIVE, clear that flag and refresh its permissions, cache,
 * dirty-bitmap store state and total sector count; finally invoke the
 * 'activate' callback of every parent that has one.
 *
 * Returns 0 on success, -ENOMEDIUM if @bs has no driver, and a negative
 * value on any other failure.  On the failure paths after the children
 * loop, BDRV_O_INACTIVE is set (again) on @bs before returning.
 */
int bdrv_activate(BlockDriverState *bs, Error **errp)
{
    BdrvChild *child, *parent;
    Error *local_err = NULL;
    int ret;
    BdrvDirtyBitmap *bm;

    GLOBAL_STATE_CODE();

    if (!bs->drv)  {
        return -ENOMEDIUM;
    }

    /* Children first; only an error object (not the return code) is fatal */
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_activate(child->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }
    }

    /*
     * Update permissions, they may differ for inactive nodes.
     *
     * Note that the required permissions of inactive images are always a
     * subset of the permissions required after activating the image. This
     * allows us to just get the permissions upfront without restricting
     * bdrv_co_invalidate_cache().
     *
     * It also means that in error cases, we don't have to try and revert to
     * the old permissions (which is an operation that could fail, too). We can
     * just keep the extended permissions for the next time that an activation
     * of the image is tried.
     */
    if (bs->open_flags & BDRV_O_INACTIVE) {
        bs->open_flags &= ~BDRV_O_INACTIVE;
        ret = bdrv_refresh_perms(bs, errp);
        if (ret < 0) {
            bs->open_flags |= BDRV_O_INACTIVE;
            return ret;
        }

        ret = bdrv_invalidate_cache(bs, errp);
        if (ret < 0) {
            bs->open_flags |= BDRV_O_INACTIVE;
            return ret;
        }

        /* Clear the skip-store setting on every dirty bitmap of this node */
        FOR_EACH_DIRTY_BITMAP(bs, bm) {
            bdrv_dirty_bitmap_skip_store(bm, false);
        }

        ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            bs->open_flags |= BDRV_O_INACTIVE;
            error_setg_errno(errp, -ret, "Could not refresh total sector count");
            return ret;
        }
    }

    /* Let every parent that implements 'activate' react to the activation */
    QLIST_FOREACH(parent, &bs->parents, next_parent) {
        if (parent->klass->activate) {
            parent->klass->activate(parent, &local_err);
            if (local_err) {
                bs->open_flags |= BDRV_O_INACTIVE;
                error_propagate(errp, local_err);
                return -EINVAL;
            }
        }
    }

    return 0;
}
   6584 
   6585 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
   6586 {
   6587     Error *local_err = NULL;
   6588     IO_CODE();
   6589 
   6590     assert(!(bs->open_flags & BDRV_O_INACTIVE));
   6591 
   6592     if (bs->drv->bdrv_co_invalidate_cache) {
   6593         bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
   6594         if (local_err) {
   6595             error_propagate(errp, local_err);
   6596             return -EINVAL;
   6597         }
   6598     }
   6599 
   6600     return 0;
   6601 }
   6602 
   6603 void bdrv_activate_all(Error **errp)
   6604 {
   6605     BlockDriverState *bs;
   6606     BdrvNextIterator it;
   6607 
   6608     GLOBAL_STATE_CODE();
   6609 
   6610     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6611         AioContext *aio_context = bdrv_get_aio_context(bs);
   6612         int ret;
   6613 
   6614         aio_context_acquire(aio_context);
   6615         ret = bdrv_activate(bs, errp);
   6616         aio_context_release(aio_context);
   6617         if (ret < 0) {
   6618             bdrv_next_cleanup(&it);
   6619             return;
   6620         }
   6621     }
   6622 }
   6623 
   6624 static bool bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
   6625 {
   6626     BdrvChild *parent;
   6627     GLOBAL_STATE_CODE();
   6628 
   6629     QLIST_FOREACH(parent, &bs->parents, next_parent) {
   6630         if (parent->klass->parent_is_bds) {
   6631             BlockDriverState *parent_bs = parent->opaque;
   6632             if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
   6633                 return true;
   6634             }
   6635         }
   6636     }
   6637 
   6638     return false;
   6639 }
   6640 
/*
 * Inactivate @bs and then, recursively, its children.
 *
 * Nothing is done (and 0 is returned) while @bs still has an active
 * BlockDriverState parent.  Otherwise the driver's and the parents'
 * 'inactivate' callbacks are run, BDRV_O_INACTIVE is set on @bs and its
 * permissions are refreshed before the children are visited.
 *
 * Returns 0 on success, -ENOMEDIUM if @bs has no driver, -EPERM if the
 * cumulative permissions still include write access, or the negative
 * value returned by a failing callback or child.
 */
static int bdrv_inactivate_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *parent;
    int ret;
    uint64_t cumulative_perms, cumulative_shared_perms;

    GLOBAL_STATE_CODE();

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    /* Make sure that we don't inactivate a child before its parent.
     * It will be covered by recursion from the yet active parent. */
    if (bdrv_has_bds_parent(bs, true)) {
        return 0;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Inactivate this node */
    if (bs->drv->bdrv_inactivate) {
        ret = bs->drv->bdrv_inactivate(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* Run the 'inactivate' callback of every parent that has one */
    QLIST_FOREACH(parent, &bs->parents, next_parent) {
        if (parent->klass->inactivate) {
            ret = parent->klass->inactivate(parent);
            if (ret < 0) {
                return ret;
            }
        }
    }

    bdrv_get_cumulative_perm(bs, &cumulative_perms,
                             &cumulative_shared_perms);
    if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
        /* Our inactive parents still need write access. Inactivation failed. */
        return -EPERM;
    }

    bs->open_flags |= BDRV_O_INACTIVE;

    /*
     * Update permissions, they may differ for inactive nodes.
     * We only tried to loosen restrictions, so errors are not fatal, ignore
     * them.
     */
    bdrv_refresh_perms(bs, NULL);

    /* Recursively inactivate children */
    QLIST_FOREACH(child, &bs->children, next) {
        ret = bdrv_inactivate_recurse(child->bs);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
   6704 
   6705 int bdrv_inactivate_all(void)
   6706 {
   6707     BlockDriverState *bs = NULL;
   6708     BdrvNextIterator it;
   6709     int ret = 0;
   6710     GSList *aio_ctxs = NULL, *ctx;
   6711 
   6712     GLOBAL_STATE_CODE();
   6713 
   6714     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6715         AioContext *aio_context = bdrv_get_aio_context(bs);
   6716 
   6717         if (!g_slist_find(aio_ctxs, aio_context)) {
   6718             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
   6719             aio_context_acquire(aio_context);
   6720         }
   6721     }
   6722 
   6723     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
   6724         /* Nodes with BDS parents are covered by recursion from the last
   6725          * parent that gets inactivated. Don't inactivate them a second
   6726          * time if that has already happened. */
   6727         if (bdrv_has_bds_parent(bs, false)) {
   6728             continue;
   6729         }
   6730         ret = bdrv_inactivate_recurse(bs);
   6731         if (ret < 0) {
   6732             bdrv_next_cleanup(&it);
   6733             goto out;
   6734         }
   6735     }
   6736 
   6737 out:
   6738     for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
   6739         AioContext *aio_context = ctx->data;
   6740         aio_context_release(aio_context);
   6741     }
   6742     g_slist_free(aio_ctxs);
   6743 
   6744     return ret;
   6745 }
   6746 
   6747 /**************************************************************/
   6748 /* removable device support */
   6749 
   6750 /**
   6751  * Return TRUE if the media is present
   6752  */
   6753 bool bdrv_is_inserted(BlockDriverState *bs)
   6754 {
   6755     BlockDriver *drv = bs->drv;
   6756     BdrvChild *child;
   6757     IO_CODE();
   6758 
   6759     if (!drv) {
   6760         return false;
   6761     }
   6762     if (drv->bdrv_is_inserted) {
   6763         return drv->bdrv_is_inserted(bs);
   6764     }
   6765     QLIST_FOREACH(child, &bs->children, next) {
   6766         if (!bdrv_is_inserted(child->bs)) {
   6767             return false;
   6768         }
   6769     }
   6770     return true;
   6771 }
   6772 
   6773 /**
   6774  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
   6775  */
   6776 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
   6777 {
   6778     BlockDriver *drv = bs->drv;
   6779     IO_CODE();
   6780 
   6781     if (drv && drv->bdrv_eject) {
   6782         drv->bdrv_eject(bs, eject_flag);
   6783     }
   6784 }
   6785 
   6786 /**
   6787  * Lock or unlock the media (if it is locked, the user won't be able
   6788  * to eject it manually).
   6789  */
   6790 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
   6791 {
   6792     BlockDriver *drv = bs->drv;
   6793     IO_CODE();
   6794     trace_bdrv_lock_medium(bs, locked);
   6795 
   6796     if (drv && drv->bdrv_lock_medium) {
   6797         drv->bdrv_lock_medium(bs, locked);
   6798     }
   6799 }
   6800 
   6801 /* Get a reference to bs */
   6802 void bdrv_ref(BlockDriverState *bs)
   6803 {
   6804     GLOBAL_STATE_CODE();
   6805     bs->refcnt++;
   6806 }
   6807 
   6808 /* Release a previously grabbed reference to bs.
   6809  * If after releasing, reference count is zero, the BlockDriverState is
   6810  * deleted. */
   6811 void bdrv_unref(BlockDriverState *bs)
   6812 {
   6813     GLOBAL_STATE_CODE();
   6814     if (!bs) {
   6815         return;
   6816     }
   6817     assert(bs->refcnt > 0);
   6818     if (--bs->refcnt == 0) {
   6819         bdrv_delete(bs);
   6820     }
   6821 }
   6822 
   6823 struct BdrvOpBlocker {
   6824     Error *reason;
   6825     QLIST_ENTRY(BdrvOpBlocker) list;
   6826 };
   6827 
   6828 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
   6829 {
   6830     BdrvOpBlocker *blocker;
   6831     GLOBAL_STATE_CODE();
   6832     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6833     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
   6834         blocker = QLIST_FIRST(&bs->op_blockers[op]);
   6835         error_propagate_prepend(errp, error_copy(blocker->reason),
   6836                                 "Node '%s' is busy: ",
   6837                                 bdrv_get_device_or_node_name(bs));
   6838         return true;
   6839     }
   6840     return false;
   6841 }
   6842 
   6843 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
   6844 {
   6845     BdrvOpBlocker *blocker;
   6846     GLOBAL_STATE_CODE();
   6847     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6848 
   6849     blocker = g_new0(BdrvOpBlocker, 1);
   6850     blocker->reason = reason;
   6851     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
   6852 }
   6853 
   6854 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
   6855 {
   6856     BdrvOpBlocker *blocker, *next;
   6857     GLOBAL_STATE_CODE();
   6858     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
   6859     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
   6860         if (blocker->reason == reason) {
   6861             QLIST_REMOVE(blocker, list);
   6862             g_free(blocker);
   6863         }
   6864     }
   6865 }
   6866 
   6867 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
   6868 {
   6869     int i;
   6870     GLOBAL_STATE_CODE();
   6871     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6872         bdrv_op_block(bs, i, reason);
   6873     }
   6874 }
   6875 
   6876 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
   6877 {
   6878     int i;
   6879     GLOBAL_STATE_CODE();
   6880     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6881         bdrv_op_unblock(bs, i, reason);
   6882     }
   6883 }
   6884 
   6885 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
   6886 {
   6887     int i;
   6888     GLOBAL_STATE_CODE();
   6889     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
   6890         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
   6891             return false;
   6892         }
   6893     }
   6894     return true;
   6895 }
   6896 
   6897 void bdrv_img_create(const char *filename, const char *fmt,
   6898                      const char *base_filename, const char *base_fmt,
   6899                      char *options, uint64_t img_size, int flags, bool quiet,
   6900                      Error **errp)
   6901 {
   6902     QemuOptsList *create_opts = NULL;
   6903     QemuOpts *opts = NULL;
   6904     const char *backing_fmt, *backing_file;
   6905     int64_t size;
   6906     BlockDriver *drv, *proto_drv;
   6907     Error *local_err = NULL;
   6908     int ret = 0;
   6909 
   6910     GLOBAL_STATE_CODE();
   6911 
   6912     /* Find driver and parse its options */
   6913     drv = bdrv_find_format(fmt);
   6914     if (!drv) {
   6915         error_setg(errp, "Unknown file format '%s'", fmt);
   6916         return;
   6917     }
   6918 
   6919     proto_drv = bdrv_find_protocol(filename, true, errp);
   6920     if (!proto_drv) {
   6921         return;
   6922     }
   6923 
   6924     if (!drv->create_opts) {
   6925         error_setg(errp, "Format driver '%s' does not support image creation",
   6926                    drv->format_name);
   6927         return;
   6928     }
   6929 
   6930     if (!proto_drv->create_opts) {
   6931         error_setg(errp, "Protocol driver '%s' does not support image creation",
   6932                    proto_drv->format_name);
   6933         return;
   6934     }
   6935 
   6936     /* Create parameter list */
   6937     create_opts = qemu_opts_append(create_opts, drv->create_opts);
   6938     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
   6939 
   6940     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
   6941 
   6942     /* Parse -o options */
   6943     if (options) {
   6944         if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
   6945             goto out;
   6946         }
   6947     }
   6948 
   6949     if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
   6950         qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
   6951     } else if (img_size != UINT64_C(-1)) {
   6952         error_setg(errp, "The image size must be specified only once");
   6953         goto out;
   6954     }
   6955 
   6956     if (base_filename) {
   6957         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
   6958                           NULL)) {
   6959             error_setg(errp, "Backing file not supported for file format '%s'",
   6960                        fmt);
   6961             goto out;
   6962         }
   6963     }
   6964 
   6965     if (base_fmt) {
   6966         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
   6967             error_setg(errp, "Backing file format not supported for file "
   6968                              "format '%s'", fmt);
   6969             goto out;
   6970         }
   6971     }
   6972 
   6973     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
   6974     if (backing_file) {
   6975         if (!strcmp(filename, backing_file)) {
   6976             error_setg(errp, "Error: Trying to create an image with the "
   6977                              "same filename as the backing file");
   6978             goto out;
   6979         }
   6980         if (backing_file[0] == '\0') {
   6981             error_setg(errp, "Expected backing file name, got empty string");
   6982             goto out;
   6983         }
   6984     }
   6985 
   6986     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
   6987 
   6988     /* The size for the image must always be specified, unless we have a backing
   6989      * file and we have not been forbidden from opening it. */
   6990     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
   6991     if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
   6992         BlockDriverState *bs;
   6993         char *full_backing;
   6994         int back_flags;
   6995         QDict *backing_options = NULL;
   6996 
   6997         full_backing =
   6998             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
   6999                                                          &local_err);
   7000         if (local_err) {
   7001             goto out;
   7002         }
   7003         assert(full_backing);
   7004 
   7005         /*
   7006          * No need to do I/O here, which allows us to open encrypted
   7007          * backing images without needing the secret
   7008          */
   7009         back_flags = flags;
   7010         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
   7011         back_flags |= BDRV_O_NO_IO;
   7012 
   7013         backing_options = qdict_new();
   7014         if (backing_fmt) {
   7015             qdict_put_str(backing_options, "driver", backing_fmt);
   7016         }
   7017         qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
   7018 
   7019         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
   7020                        &local_err);
   7021         g_free(full_backing);
   7022         if (!bs) {
   7023             error_append_hint(&local_err, "Could not open backing image.\n");
   7024             goto out;
   7025         } else {
   7026             if (!backing_fmt) {
   7027                 error_setg(&local_err,
   7028                            "Backing file specified without backing format");
   7029                 error_append_hint(&local_err, "Detected format of %s.",
   7030                                   bs->drv->format_name);
   7031                 goto out;
   7032             }
   7033             if (size == -1) {
   7034                 /* Opened BS, have no size */
   7035                 size = bdrv_getlength(bs);
   7036                 if (size < 0) {
   7037                     error_setg_errno(errp, -size, "Could not get size of '%s'",
   7038                                      backing_file);
   7039                     bdrv_unref(bs);
   7040                     goto out;
   7041                 }
   7042                 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
   7043             }
   7044             bdrv_unref(bs);
   7045         }
   7046         /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
   7047     } else if (backing_file && !backing_fmt) {
   7048         error_setg(&local_err,
   7049                    "Backing file specified without backing format");
   7050         goto out;
   7051     }
   7052 
   7053     if (size == -1) {
   7054         error_setg(errp, "Image creation needs a size parameter");
   7055         goto out;
   7056     }
   7057 
   7058     if (!quiet) {
   7059         printf("Formatting '%s', fmt=%s ", filename, fmt);
   7060         qemu_opts_print(opts, " ");
   7061         puts("");
   7062         fflush(stdout);
   7063     }
   7064 
   7065     ret = bdrv_create(drv, filename, opts, &local_err);
   7066 
   7067     if (ret == -EFBIG) {
   7068         /* This is generally a better message than whatever the driver would
   7069          * deliver (especially because of the cluster_size_hint), since that
   7070          * is most probably not much different from "image too large". */
   7071         const char *cluster_size_hint = "";
   7072         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
   7073             cluster_size_hint = " (try using a larger cluster size)";
   7074         }
   7075         error_setg(errp, "The image size is too large for file format '%s'"
   7076                    "%s", fmt, cluster_size_hint);
   7077         error_free(local_err);
   7078         local_err = NULL;
   7079     }
   7080 
   7081 out:
   7082     qemu_opts_del(opts);
   7083     qemu_opts_free(create_opts);
   7084     error_propagate(errp, local_err);
   7085 }
   7086 
   7087 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
   7088 {
   7089     IO_CODE();
   7090     return bs ? bs->aio_context : qemu_get_aio_context();
   7091 }
   7092 
   7093 AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
   7094 {
   7095     Coroutine *self = qemu_coroutine_self();
   7096     AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
   7097     AioContext *new_ctx;
   7098     IO_CODE();
   7099 
   7100     /*
   7101      * Increase bs->in_flight to ensure that this operation is completed before
   7102      * moving the node to a different AioContext. Read new_ctx only afterwards.
   7103      */
   7104     bdrv_inc_in_flight(bs);
   7105 
   7106     new_ctx = bdrv_get_aio_context(bs);
   7107     aio_co_reschedule_self(new_ctx);
   7108     return old_ctx;
   7109 }
   7110 
   7111 void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
   7112 {
   7113     IO_CODE();
   7114     aio_co_reschedule_self(old_ctx);
   7115     bdrv_dec_in_flight(bs);
   7116 }
   7117 
   7118 void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
   7119 {
   7120     AioContext *ctx = bdrv_get_aio_context(bs);
   7121 
   7122     /* In the main thread, bs->aio_context won't change concurrently */
   7123     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
   7124 
   7125     /*
   7126      * We're in coroutine context, so we already hold the lock of the main
   7127      * loop AioContext. Don't lock it twice to avoid deadlocks.
   7128      */
   7129     assert(qemu_in_coroutine());
   7130     if (ctx != qemu_get_aio_context()) {
   7131         aio_context_acquire(ctx);
   7132     }
   7133 }
   7134 
   7135 void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
   7136 {
   7137     AioContext *ctx = bdrv_get_aio_context(bs);
   7138 
   7139     assert(qemu_in_coroutine());
   7140     if (ctx != qemu_get_aio_context()) {
   7141         aio_context_release(ctx);
   7142     }
   7143 }
   7144 
   7145 void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
   7146 {
   7147     IO_CODE();
   7148     aio_co_enter(bdrv_get_aio_context(bs), co);
   7149 }
   7150 
   7151 static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
   7152 {
   7153     GLOBAL_STATE_CODE();
   7154     QLIST_REMOVE(ban, list);
   7155     g_free(ban);
   7156 }
   7157 
   7158 static void bdrv_detach_aio_context(BlockDriverState *bs)
   7159 {
   7160     BdrvAioNotifier *baf, *baf_tmp;
   7161 
   7162     assert(!bs->walking_aio_notifiers);
   7163     GLOBAL_STATE_CODE();
   7164     bs->walking_aio_notifiers = true;
   7165     QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
   7166         if (baf->deleted) {
   7167             bdrv_do_remove_aio_context_notifier(baf);
   7168         } else {
   7169             baf->detach_aio_context(baf->opaque);
   7170         }
   7171     }
   7172     /* Never mind iterating again to check for ->deleted.  bdrv_close() will
   7173      * remove remaining aio notifiers if we aren't called again.
   7174      */
   7175     bs->walking_aio_notifiers = false;
   7176 
   7177     if (bs->drv && bs->drv->bdrv_detach_aio_context) {
   7178         bs->drv->bdrv_detach_aio_context(bs);
   7179     }
   7180 
   7181     if (bs->quiesce_counter) {
   7182         aio_enable_external(bs->aio_context);
   7183     }
   7184     assert_bdrv_graph_writable(bs);
   7185     bs->aio_context = NULL;
   7186 }
   7187 
   7188 static void bdrv_attach_aio_context(BlockDriverState *bs,
   7189                                     AioContext *new_context)
   7190 {
   7191     BdrvAioNotifier *ban, *ban_tmp;
   7192     GLOBAL_STATE_CODE();
   7193 
   7194     if (bs->quiesce_counter) {
   7195         aio_disable_external(new_context);
   7196     }
   7197 
   7198     assert_bdrv_graph_writable(bs);
   7199     bs->aio_context = new_context;
   7200 
   7201     if (bs->drv && bs->drv->bdrv_attach_aio_context) {
   7202         bs->drv->bdrv_attach_aio_context(bs, new_context);
   7203     }
   7204 
   7205     assert(!bs->walking_aio_notifiers);
   7206     bs->walking_aio_notifiers = true;
   7207     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
   7208         if (ban->deleted) {
   7209             bdrv_do_remove_aio_context_notifier(ban);
   7210         } else {
   7211             ban->attached_aio_context(new_context, ban->opaque);
   7212         }
   7213     }
   7214     bs->walking_aio_notifiers = false;
   7215 }
   7216 
   7217 typedef struct BdrvStateSetAioContext {
   7218     AioContext *new_ctx;
   7219     BlockDriverState *bs;
   7220 } BdrvStateSetAioContext;
   7221 
   7222 static bool bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx,
   7223                                            GHashTable *visited,
   7224                                            Transaction *tran,
   7225                                            Error **errp)
   7226 {
   7227     GLOBAL_STATE_CODE();
   7228     if (g_hash_table_contains(visited, c)) {
   7229         return true;
   7230     }
   7231     g_hash_table_add(visited, c);
   7232 
   7233     /*
   7234      * A BdrvChildClass that doesn't handle AioContext changes cannot
   7235      * tolerate any AioContext changes
   7236      */
   7237     if (!c->klass->change_aio_ctx) {
   7238         char *user = bdrv_child_user_desc(c);
   7239         error_setg(errp, "Changing iothreads is not supported by %s", user);
   7240         g_free(user);
   7241         return false;
   7242     }
   7243     if (!c->klass->change_aio_ctx(c, ctx, visited, tran, errp)) {
   7244         assert(!errp || *errp);
   7245         return false;
   7246     }
   7247     return true;
   7248 }
   7249 
   7250 bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
   7251                                    GHashTable *visited, Transaction *tran,
   7252                                    Error **errp)
   7253 {
   7254     GLOBAL_STATE_CODE();
   7255     if (g_hash_table_contains(visited, c)) {
   7256         return true;
   7257     }
   7258     g_hash_table_add(visited, c);
   7259     return bdrv_change_aio_context(c->bs, ctx, visited, tran, errp);
   7260 }
   7261 
   7262 static void bdrv_set_aio_context_clean(void *opaque)
   7263 {
   7264     BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
   7265     BlockDriverState *bs = (BlockDriverState *) state->bs;
   7266 
   7267     /* Paired with bdrv_drained_begin in bdrv_change_aio_context() */
   7268     bdrv_drained_end(bs);
   7269 
   7270     g_free(state);
   7271 }
   7272 
   7273 static void bdrv_set_aio_context_commit(void *opaque)
   7274 {
   7275     BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
   7276     BlockDriverState *bs = (BlockDriverState *) state->bs;
   7277     AioContext *new_context = state->new_ctx;
   7278     AioContext *old_context = bdrv_get_aio_context(bs);
   7279     assert_bdrv_graph_writable(bs);
   7280 
   7281     /*
   7282      * Take the old AioContex when detaching it from bs.
   7283      * At this point, new_context lock is already acquired, and we are now
   7284      * also taking old_context. This is safe as long as bdrv_detach_aio_context
   7285      * does not call AIO_POLL_WHILE().
   7286      */
   7287     if (old_context != qemu_get_aio_context()) {
   7288         aio_context_acquire(old_context);
   7289     }
   7290     bdrv_detach_aio_context(bs);
   7291     if (old_context != qemu_get_aio_context()) {
   7292         aio_context_release(old_context);
   7293     }
   7294     bdrv_attach_aio_context(bs, new_context);
   7295 }
   7296 
   7297 static TransactionActionDrv set_aio_context = {
   7298     .commit = bdrv_set_aio_context_commit,
   7299     .clean = bdrv_set_aio_context_clean,
   7300 };
   7301 
   7302 /*
   7303  * Changes the AioContext used for fd handlers, timers, and BHs by this
   7304  * BlockDriverState and all its children and parents.
   7305  *
   7306  * Must be called from the main AioContext.
   7307  *
   7308  * The caller must own the AioContext lock for the old AioContext of bs, but it
   7309  * must not own the AioContext lock for new_context (unless new_context is the
   7310  * same as the current context of bs).
   7311  *
   7312  * @visited will accumulate all visited BdrvChild objects. The caller is
   7313  * responsible for freeing the list afterwards.
   7314  */
   7315 static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
   7316                                     GHashTable *visited, Transaction *tran,
   7317                                     Error **errp)
   7318 {
   7319     BdrvChild *c;
   7320     BdrvStateSetAioContext *state;
   7321 
   7322     GLOBAL_STATE_CODE();
   7323 
   7324     if (bdrv_get_aio_context(bs) == ctx) {
   7325         return true;
   7326     }
   7327 
   7328     QLIST_FOREACH(c, &bs->parents, next_parent) {
   7329         if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
   7330             return false;
   7331         }
   7332     }
   7333 
   7334     QLIST_FOREACH(c, &bs->children, next) {
   7335         if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
   7336             return false;
   7337         }
   7338     }
   7339 
   7340     state = g_new(BdrvStateSetAioContext, 1);
   7341     *state = (BdrvStateSetAioContext) {
   7342         .new_ctx = ctx,
   7343         .bs = bs,
   7344     };
   7345 
   7346     /* Paired with bdrv_drained_end in bdrv_set_aio_context_clean() */
   7347     bdrv_drained_begin(bs);
   7348 
   7349     tran_add(tran, &set_aio_context, state);
   7350 
   7351     return true;
   7352 }
   7353 
   7354 /*
   7355  * Change bs's and recursively all of its parents' and children's AioContext
   7356  * to the given new context, returning an error if that isn't possible.
   7357  *
   7358  * If ignore_child is not NULL, that child (and its subgraph) will not
   7359  * be touched.
   7360  *
   7361  * This function still requires the caller to take the bs current
   7362  * AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
   7363  * assumes the lock is always held if bs is in another AioContext.
   7364  * For the same reason, it temporarily also holds the new AioContext, since
   7365  * bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
   7366  * Therefore the new AioContext lock must not be taken by the caller.
   7367  */
   7368 int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
   7369                                 BdrvChild *ignore_child, Error **errp)
   7370 {
   7371     Transaction *tran;
   7372     GHashTable *visited;
   7373     int ret;
   7374     AioContext *old_context = bdrv_get_aio_context(bs);
   7375     GLOBAL_STATE_CODE();
   7376 
   7377     /*
   7378      * Recursion phase: go through all nodes of the graph.
   7379      * Take care of checking that all nodes support changing AioContext
   7380      * and drain them, builing a linear list of callbacks to run if everything
   7381      * is successful (the transaction itself).
   7382      */
   7383     tran = tran_new();
   7384     visited = g_hash_table_new(NULL, NULL);
   7385     if (ignore_child) {
   7386         g_hash_table_add(visited, ignore_child);
   7387     }
   7388     ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp);
   7389     g_hash_table_destroy(visited);
   7390 
   7391     /*
   7392      * Linear phase: go through all callbacks collected in the transaction.
   7393      * Run all callbacks collected in the recursion to switch all nodes
   7394      * AioContext lock (transaction commit), or undo all changes done in the
   7395      * recursion (transaction abort).
   7396      */
   7397 
   7398     if (!ret) {
   7399         /* Just run clean() callbacks. No AioContext changed. */
   7400         tran_abort(tran);
   7401         return -EPERM;
   7402     }
   7403 
   7404     /*
   7405      * Release old AioContext, it won't be needed anymore, as all
   7406      * bdrv_drained_begin() have been called already.
   7407      */
   7408     if (qemu_get_aio_context() != old_context) {
   7409         aio_context_release(old_context);
   7410     }
   7411 
   7412     /*
   7413      * Acquire new AioContext since bdrv_drained_end() is going to be called
   7414      * after we switched all nodes in the new AioContext, and the function
   7415      * assumes that the lock of the bs is always taken.
   7416      */
   7417     if (qemu_get_aio_context() != ctx) {
   7418         aio_context_acquire(ctx);
   7419     }
   7420 
   7421     tran_commit(tran);
   7422 
   7423     if (qemu_get_aio_context() != ctx) {
   7424         aio_context_release(ctx);
   7425     }
   7426 
   7427     /* Re-acquire the old AioContext, since the caller takes and releases it. */
   7428     if (qemu_get_aio_context() != old_context) {
   7429         aio_context_acquire(old_context);
   7430     }
   7431 
   7432     return 0;
   7433 }
   7434 
   7435 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
   7436         void (*attached_aio_context)(AioContext *new_context, void *opaque),
   7437         void (*detach_aio_context)(void *opaque), void *opaque)
   7438 {
   7439     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
   7440     *ban = (BdrvAioNotifier){
   7441         .attached_aio_context = attached_aio_context,
   7442         .detach_aio_context   = detach_aio_context,
   7443         .opaque               = opaque
   7444     };
   7445     GLOBAL_STATE_CODE();
   7446 
   7447     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
   7448 }
   7449 
   7450 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
   7451                                       void (*attached_aio_context)(AioContext *,
   7452                                                                    void *),
   7453                                       void (*detach_aio_context)(void *),
   7454                                       void *opaque)
   7455 {
   7456     BdrvAioNotifier *ban, *ban_next;
   7457     GLOBAL_STATE_CODE();
   7458 
   7459     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
   7460         if (ban->attached_aio_context == attached_aio_context &&
   7461             ban->detach_aio_context   == detach_aio_context   &&
   7462             ban->opaque               == opaque               &&
   7463             ban->deleted              == false)
   7464         {
   7465             if (bs->walking_aio_notifiers) {
   7466                 ban->deleted = true;
   7467             } else {
   7468                 bdrv_do_remove_aio_context_notifier(ban);
   7469             }
   7470             return;
   7471         }
   7472     }
   7473 
   7474     abort();
   7475 }
   7476 
   7477 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
   7478                        BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
   7479                        bool force,
   7480                        Error **errp)
   7481 {
   7482     GLOBAL_STATE_CODE();
   7483     if (!bs->drv) {
   7484         error_setg(errp, "Node is ejected");
   7485         return -ENOMEDIUM;
   7486     }
   7487     if (!bs->drv->bdrv_amend_options) {
   7488         error_setg(errp, "Block driver '%s' does not support option amendment",
   7489                    bs->drv->format_name);
   7490         return -ENOTSUP;
   7491     }
   7492     return bs->drv->bdrv_amend_options(bs, opts, status_cb,
   7493                                        cb_opaque, force, errp);
   7494 }
   7495 
   7496 /*
   7497  * This function checks whether the given @to_replace is allowed to be
   7498  * replaced by a node that always shows the same data as @bs.  This is
   7499  * used for example to verify whether the mirror job can replace
   7500  * @to_replace by the target mirrored from @bs.
   7501  * To be replaceable, @bs and @to_replace may either be guaranteed to
   7502  * always show the same data (because they are only connected through
   7503  * filters), or some driver may allow replacing one of its children
   7504  * because it can guarantee that this child's data is not visible at
   7505  * all (for example, for dissenting quorum children that have no other
   7506  * parents).
   7507  */
   7508 bool bdrv_recurse_can_replace(BlockDriverState *bs,
   7509                               BlockDriverState *to_replace)
   7510 {
   7511     BlockDriverState *filtered;
   7512 
   7513     GLOBAL_STATE_CODE();
   7514 
   7515     if (!bs || !bs->drv) {
   7516         return false;
   7517     }
   7518 
   7519     if (bs == to_replace) {
   7520         return true;
   7521     }
   7522 
   7523     /* See what the driver can do */
   7524     if (bs->drv->bdrv_recurse_can_replace) {
   7525         return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
   7526     }
   7527 
   7528     /* For filters without an own implementation, we can recurse on our own */
   7529     filtered = bdrv_filter_bs(bs);
   7530     if (filtered) {
   7531         return bdrv_recurse_can_replace(filtered, to_replace);
   7532     }
   7533 
   7534     /* Safe default */
   7535     return false;
   7536 }
   7537 
   7538 /*
   7539  * Check whether the given @node_name can be replaced by a node that
   7540  * has the same data as @parent_bs.  If so, return @node_name's BDS;
   7541  * NULL otherwise.
   7542  *
   7543  * @node_name must be a (recursive) *child of @parent_bs (or this
   7544  * function will return NULL).
   7545  *
   7546  * The result (whether the node can be replaced or not) is only valid
   7547  * for as long as no graph or permission changes occur.
   7548  */
   7549 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
   7550                                         const char *node_name, Error **errp)
   7551 {
   7552     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
   7553     AioContext *aio_context;
   7554 
   7555     GLOBAL_STATE_CODE();
   7556 
   7557     if (!to_replace_bs) {
   7558         error_setg(errp, "Failed to find node with node-name='%s'", node_name);
   7559         return NULL;
   7560     }
   7561 
   7562     aio_context = bdrv_get_aio_context(to_replace_bs);
   7563     aio_context_acquire(aio_context);
   7564 
   7565     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
   7566         to_replace_bs = NULL;
   7567         goto out;
   7568     }
   7569 
   7570     /* We don't want arbitrary node of the BDS chain to be replaced only the top
   7571      * most non filter in order to prevent data corruption.
   7572      * Another benefit is that this tests exclude backing files which are
   7573      * blocked by the backing blockers.
   7574      */
   7575     if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
   7576         error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
   7577                    "because it cannot be guaranteed that doing so would not "
   7578                    "lead to an abrupt change of visible data",
   7579                    node_name, parent_bs->node_name);
   7580         to_replace_bs = NULL;
   7581         goto out;
   7582     }
   7583 
   7584 out:
   7585     aio_context_release(aio_context);
   7586     return to_replace_bs;
   7587 }
   7588 
   7589 /**
   7590  * Iterates through the list of runtime option keys that are said to
   7591  * be "strong" for a BDS.  An option is called "strong" if it changes
   7592  * a BDS's data.  For example, the null block driver's "size" and
   7593  * "read-zeroes" options are strong, but its "latency-ns" option is
   7594  * not.
   7595  *
   7596  * If a key returned by this function ends with a dot, all options
   7597  * starting with that prefix are strong.
   7598  */
   7599 static const char *const *strong_options(BlockDriverState *bs,
   7600                                          const char *const *curopt)
   7601 {
   7602     static const char *const global_options[] = {
   7603         "driver", "filename", NULL
   7604     };
   7605 
   7606     if (!curopt) {
   7607         return &global_options[0];
   7608     }
   7609 
   7610     curopt++;
   7611     if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
   7612         curopt = bs->drv->strong_runtime_opts;
   7613     }
   7614 
   7615     return (curopt && *curopt) ? curopt : NULL;
   7616 }
   7617 
   7618 /**
   7619  * Copies all strong runtime options from bs->options to the given
   7620  * QDict.  The set of strong option keys is determined by invoking
   7621  * strong_options().
   7622  *
   7623  * Returns true iff any strong option was present in bs->options (and
   7624  * thus copied to the target QDict) with the exception of "filename"
   7625  * and "driver".  The caller is expected to use this value to decide
   7626  * whether the existence of strong options prevents the generation of
   7627  * a plain filename.
   7628  */
   7629 static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
   7630 {
   7631     bool found_any = false;
   7632     const char *const *option_name = NULL;
   7633 
   7634     if (!bs->drv) {
   7635         return false;
   7636     }
   7637 
   7638     while ((option_name = strong_options(bs, option_name))) {
   7639         bool option_given = false;
   7640 
   7641         assert(strlen(*option_name) > 0);
   7642         if ((*option_name)[strlen(*option_name) - 1] != '.') {
   7643             QObject *entry = qdict_get(bs->options, *option_name);
   7644             if (!entry) {
   7645                 continue;
   7646             }
   7647 
   7648             qdict_put_obj(d, *option_name, qobject_ref(entry));
   7649             option_given = true;
   7650         } else {
   7651             const QDictEntry *entry;
   7652             for (entry = qdict_first(bs->options); entry;
   7653                  entry = qdict_next(bs->options, entry))
   7654             {
   7655                 if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
   7656                     qdict_put_obj(d, qdict_entry_key(entry),
   7657                                   qobject_ref(qdict_entry_value(entry)));
   7658                     option_given = true;
   7659                 }
   7660             }
   7661         }
   7662 
   7663         /* While "driver" and "filename" need to be included in a JSON filename,
   7664          * their existence does not prohibit generation of a plain filename. */
   7665         if (!found_any && option_given &&
   7666             strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
   7667         {
   7668             found_any = true;
   7669         }
   7670     }
   7671 
   7672     if (!qdict_haskey(d, "driver")) {
   7673         /* Drivers created with bdrv_new_open_driver() may not have a
   7674          * @driver option.  Add it here. */
   7675         qdict_put_str(d, "driver", bs->drv->format_name);
   7676     }
   7677 
   7678     return found_any;
   7679 }
   7680 
   7681 /* Note: This function may return false positives; it may return true
   7682  * even if opening the backing file specified by bs's image header
   7683  * would result in exactly bs->backing. */
   7684 static bool bdrv_backing_overridden(BlockDriverState *bs)
   7685 {
   7686     GLOBAL_STATE_CODE();
   7687     if (bs->backing) {
   7688         return strcmp(bs->auto_backing_file,
   7689                       bs->backing->bs->filename);
   7690     } else {
   7691         /* No backing BDS, so if the image header reports any backing
   7692          * file, it must have been suppressed */
   7693         return bs->auto_backing_file[0] != '\0';
   7694     }
   7695 }
   7696 
   7697 /* Updates the following BDS fields:
   7698  *  - exact_filename: A filename which may be used for opening a block device
   7699  *                    which (mostly) equals the given BDS (even without any
   7700  *                    other options; so reading and writing must return the same
   7701  *                    results, but caching etc. may be different)
   7702  *  - full_open_options: Options which, when given when opening a block device
   7703  *                       (without a filename), result in a BDS (mostly)
   7704  *                       equalling the given one
   7705  *  - filename: If exact_filename is set, it is copied here. Otherwise,
   7706  *              full_open_options is converted to a JSON object, prefixed with
   7707  *              "json:" (for use through the JSON pseudo protocol) and put here.
   7708  */
   7709 void bdrv_refresh_filename(BlockDriverState *bs)
   7710 {
   7711     BlockDriver *drv = bs->drv;
   7712     BdrvChild *child;
   7713     BlockDriverState *primary_child_bs;
   7714     QDict *opts;
   7715     bool backing_overridden;
   7716     bool generate_json_filename; /* Whether our default implementation should
   7717                                     fill exact_filename (false) or not (true) */
   7718 
   7719     GLOBAL_STATE_CODE();
   7720 
   7721     if (!drv) {
   7722         return;
   7723     }
   7724 
   7725     /* This BDS's file name may depend on any of its children's file names, so
   7726      * refresh those first */
   7727     QLIST_FOREACH(child, &bs->children, next) {
   7728         bdrv_refresh_filename(child->bs);
   7729     }
   7730 
   7731     if (bs->implicit) {
   7732         /* For implicit nodes, just copy everything from the single child */
   7733         child = QLIST_FIRST(&bs->children);
   7734         assert(QLIST_NEXT(child, next) == NULL);
   7735 
   7736         pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
   7737                 child->bs->exact_filename);
   7738         pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
   7739 
   7740         qobject_unref(bs->full_open_options);
   7741         bs->full_open_options = qobject_ref(child->bs->full_open_options);
   7742 
   7743         return;
   7744     }
   7745 
   7746     backing_overridden = bdrv_backing_overridden(bs);
   7747 
   7748     if (bs->open_flags & BDRV_O_NO_IO) {
   7749         /* Without I/O, the backing file does not change anything.
   7750          * Therefore, in such a case (primarily qemu-img), we can
   7751          * pretend the backing file has not been overridden even if
   7752          * it technically has been. */
   7753         backing_overridden = false;
   7754     }
   7755 
   7756     /* Gather the options QDict */
   7757     opts = qdict_new();
   7758     generate_json_filename = append_strong_runtime_options(opts, bs);
   7759     generate_json_filename |= backing_overridden;
   7760 
   7761     if (drv->bdrv_gather_child_options) {
   7762         /* Some block drivers may not want to present all of their children's
   7763          * options, or name them differently from BdrvChild.name */
   7764         drv->bdrv_gather_child_options(bs, opts, backing_overridden);
   7765     } else {
   7766         QLIST_FOREACH(child, &bs->children, next) {
   7767             if (child == bs->backing && !backing_overridden) {
   7768                 /* We can skip the backing BDS if it has not been overridden */
   7769                 continue;
   7770             }
   7771 
   7772             qdict_put(opts, child->name,
   7773                       qobject_ref(child->bs->full_open_options));
   7774         }
   7775 
   7776         if (backing_overridden && !bs->backing) {
   7777             /* Force no backing file */
   7778             qdict_put_null(opts, "backing");
   7779         }
   7780     }
   7781 
   7782     qobject_unref(bs->full_open_options);
   7783     bs->full_open_options = opts;
   7784 
   7785     primary_child_bs = bdrv_primary_bs(bs);
   7786 
   7787     if (drv->bdrv_refresh_filename) {
   7788         /* Obsolete information is of no use here, so drop the old file name
   7789          * information before refreshing it */
   7790         bs->exact_filename[0] = '\0';
   7791 
   7792         drv->bdrv_refresh_filename(bs);
   7793     } else if (primary_child_bs) {
   7794         /*
   7795          * Try to reconstruct valid information from the underlying
   7796          * file -- this only works for format nodes (filter nodes
   7797          * cannot be probed and as such must be selected by the user
   7798          * either through an options dict, or through a special
   7799          * filename which the filter driver must construct in its
   7800          * .bdrv_refresh_filename() implementation).
   7801          */
   7802 
   7803         bs->exact_filename[0] = '\0';
   7804 
   7805         /*
   7806          * We can use the underlying file's filename if:
   7807          * - it has a filename,
   7808          * - the current BDS is not a filter,
   7809          * - the file is a protocol BDS, and
   7810          * - opening that file (as this BDS's format) will automatically create
   7811          *   the BDS tree we have right now, that is:
   7812          *   - the user did not significantly change this BDS's behavior with
   7813          *     some explicit (strong) options
   7814          *   - no non-file child of this BDS has been overridden by the user
   7815          *   Both of these conditions are represented by generate_json_filename.
   7816          */
   7817         if (primary_child_bs->exact_filename[0] &&
   7818             primary_child_bs->drv->bdrv_file_open &&
   7819             !drv->is_filter && !generate_json_filename)
   7820         {
   7821             strcpy(bs->exact_filename, primary_child_bs->exact_filename);
   7822         }
   7823     }
   7824 
   7825     if (bs->exact_filename[0]) {
   7826         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
   7827     } else {
   7828         GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
   7829         if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
   7830                      json->str) >= sizeof(bs->filename)) {
   7831             /* Give user a hint if we truncated things. */
   7832             strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
   7833         }
   7834         g_string_free(json, true);
   7835     }
   7836 }
   7837 
   7838 char *bdrv_dirname(BlockDriverState *bs, Error **errp)
   7839 {
   7840     BlockDriver *drv = bs->drv;
   7841     BlockDriverState *child_bs;
   7842 
   7843     GLOBAL_STATE_CODE();
   7844 
   7845     if (!drv) {
   7846         error_setg(errp, "Node '%s' is ejected", bs->node_name);
   7847         return NULL;
   7848     }
   7849 
   7850     if (drv->bdrv_dirname) {
   7851         return drv->bdrv_dirname(bs, errp);
   7852     }
   7853 
   7854     child_bs = bdrv_primary_bs(bs);
   7855     if (child_bs) {
   7856         return bdrv_dirname(child_bs, errp);
   7857     }
   7858 
   7859     bdrv_refresh_filename(bs);
   7860     if (bs->exact_filename[0] != '\0') {
   7861         return path_combine(bs->exact_filename, "");
   7862     }
   7863 
   7864     error_setg(errp, "Cannot generate a base directory for %s nodes",
   7865                drv->format_name);
   7866     return NULL;
   7867 }
   7868 
   7869 /*
   7870  * Hot add/remove a BDS's child. So the user can take a child offline when
   7871  * it is broken and take a new child online
   7872  */
   7873 void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
   7874                     Error **errp)
   7875 {
   7876     GLOBAL_STATE_CODE();
   7877     if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
   7878         error_setg(errp, "The node %s does not support adding a child",
   7879                    bdrv_get_device_or_node_name(parent_bs));
   7880         return;
   7881     }
   7882 
   7883     if (!QLIST_EMPTY(&child_bs->parents)) {
   7884         error_setg(errp, "The node %s already has a parent",
   7885                    child_bs->node_name);
   7886         return;
   7887     }
   7888 
   7889     parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
   7890 }
   7891 
   7892 void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
   7893 {
   7894     BdrvChild *tmp;
   7895 
   7896     GLOBAL_STATE_CODE();
   7897     if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
   7898         error_setg(errp, "The node %s does not support removing a child",
   7899                    bdrv_get_device_or_node_name(parent_bs));
   7900         return;
   7901     }
   7902 
   7903     QLIST_FOREACH(tmp, &parent_bs->children, next) {
   7904         if (tmp == child) {
   7905             break;
   7906         }
   7907     }
   7908 
   7909     if (!tmp) {
   7910         error_setg(errp, "The node %s does not have a child named %s",
   7911                    bdrv_get_device_or_node_name(parent_bs),
   7912                    bdrv_get_device_or_node_name(child->bs));
   7913         return;
   7914     }
   7915 
   7916     parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
   7917 }
   7918 
   7919 int bdrv_make_empty(BdrvChild *c, Error **errp)
   7920 {
   7921     BlockDriver *drv = c->bs->drv;
   7922     int ret;
   7923 
   7924     GLOBAL_STATE_CODE();
   7925     assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
   7926 
   7927     if (!drv->bdrv_make_empty) {
   7928         error_setg(errp, "%s does not support emptying nodes",
   7929                    drv->format_name);
   7930         return -ENOTSUP;
   7931     }
   7932 
   7933     ret = drv->bdrv_make_empty(c->bs);
   7934     if (ret < 0) {
   7935         error_setg_errno(errp, -ret, "Failed to empty %s",
   7936                          c->bs->filename);
   7937         return ret;
   7938     }
   7939 
   7940     return 0;
   7941 }
   7942 
   7943 /*
   7944  * Return the child that @bs acts as an overlay for, and from which data may be
   7945  * copied in COW or COR operations.  Usually this is the backing file.
   7946  */
   7947 BdrvChild *bdrv_cow_child(BlockDriverState *bs)
   7948 {
   7949     IO_CODE();
   7950 
   7951     if (!bs || !bs->drv) {
   7952         return NULL;
   7953     }
   7954 
   7955     if (bs->drv->is_filter) {
   7956         return NULL;
   7957     }
   7958 
   7959     if (!bs->backing) {
   7960         return NULL;
   7961     }
   7962 
   7963     assert(bs->backing->role & BDRV_CHILD_COW);
   7964     return bs->backing;
   7965 }
   7966 
   7967 /*
   7968  * If @bs acts as a filter for exactly one of its children, return
   7969  * that child.
   7970  */
   7971 BdrvChild *bdrv_filter_child(BlockDriverState *bs)
   7972 {
   7973     BdrvChild *c;
   7974     IO_CODE();
   7975 
   7976     if (!bs || !bs->drv) {
   7977         return NULL;
   7978     }
   7979 
   7980     if (!bs->drv->is_filter) {
   7981         return NULL;
   7982     }
   7983 
   7984     /* Only one of @backing or @file may be used */
   7985     assert(!(bs->backing && bs->file));
   7986 
   7987     c = bs->backing ?: bs->file;
   7988     if (!c) {
   7989         return NULL;
   7990     }
   7991 
   7992     assert(c->role & BDRV_CHILD_FILTERED);
   7993     return c;
   7994 }
   7995 
   7996 /*
   7997  * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
   7998  * whichever is non-NULL.
   7999  *
   8000  * Return NULL if both are NULL.
   8001  */
   8002 BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
   8003 {
   8004     BdrvChild *cow_child = bdrv_cow_child(bs);
   8005     BdrvChild *filter_child = bdrv_filter_child(bs);
   8006     IO_CODE();
   8007 
   8008     /* Filter nodes cannot have COW backing files */
   8009     assert(!(cow_child && filter_child));
   8010 
   8011     return cow_child ?: filter_child;
   8012 }
   8013 
   8014 /*
   8015  * Return the primary child of this node: For filters, that is the
   8016  * filtered child.  For other nodes, that is usually the child storing
   8017  * metadata.
   8018  * (A generally more helpful description is that this is (usually) the
   8019  * child that has the same filename as @bs.)
   8020  *
   8021  * Drivers do not necessarily have a primary child; for example quorum
   8022  * does not.
   8023  */
   8024 BdrvChild *bdrv_primary_child(BlockDriverState *bs)
   8025 {
   8026     BdrvChild *c, *found = NULL;
   8027     IO_CODE();
   8028 
   8029     QLIST_FOREACH(c, &bs->children, next) {
   8030         if (c->role & BDRV_CHILD_PRIMARY) {
   8031             assert(!found);
   8032             found = c;
   8033         }
   8034     }
   8035 
   8036     return found;
   8037 }
   8038 
   8039 static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
   8040                                               bool stop_on_explicit_filter)
   8041 {
   8042     BdrvChild *c;
   8043 
   8044     if (!bs) {
   8045         return NULL;
   8046     }
   8047 
   8048     while (!(stop_on_explicit_filter && !bs->implicit)) {
   8049         c = bdrv_filter_child(bs);
   8050         if (!c) {
   8051             /*
   8052              * A filter that is embedded in a working block graph must
   8053              * have a child.  Assert this here so this function does
   8054              * not return a filter node that is not expected by the
   8055              * caller.
   8056              */
   8057             assert(!bs->drv || !bs->drv->is_filter);
   8058             break;
   8059         }
   8060         bs = c->bs;
   8061     }
   8062     /*
   8063      * Note that this treats nodes with bs->drv == NULL as not being
   8064      * filters (bs->drv == NULL should be replaced by something else
   8065      * anyway).
   8066      * The advantage of this behavior is that this function will thus
   8067      * always return a non-NULL value (given a non-NULL @bs).
   8068      */
   8069 
   8070     return bs;
   8071 }
   8072 
   8073 /*
   8074  * Return the first BDS that has not been added implicitly or that
   8075  * does not have a filtered child down the chain starting from @bs
   8076  * (including @bs itself).
   8077  */
   8078 BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
   8079 {
   8080     GLOBAL_STATE_CODE();
   8081     return bdrv_do_skip_filters(bs, true);
   8082 }
   8083 
   8084 /*
   8085  * Return the first BDS that does not have a filtered child down the
   8086  * chain starting from @bs (including @bs itself).
   8087  */
   8088 BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
   8089 {
   8090     IO_CODE();
   8091     return bdrv_do_skip_filters(bs, false);
   8092 }
   8093 
   8094 /*
   8095  * For a backing chain, return the first non-filter backing image of
   8096  * the first non-filter image.
   8097  */
   8098 BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
   8099 {
   8100     IO_CODE();
   8101     return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
   8102 }
   8103 
   8104 /**
   8105  * Check whether [offset, offset + bytes) overlaps with the cached
   8106  * block-status data region.
   8107  *
   8108  * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
   8109  * which is what bdrv_bsc_is_data()'s interface needs.
   8110  * Otherwise, *pnum is not touched.
   8111  */
   8112 static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
   8113                                            int64_t offset, int64_t bytes,
   8114                                            int64_t *pnum)
   8115 {
   8116     BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
   8117     bool overlaps;
   8118 
   8119     overlaps =
   8120         qatomic_read(&bsc->valid) &&
   8121         ranges_overlap(offset, bytes, bsc->data_start,
   8122                        bsc->data_end - bsc->data_start);
   8123 
   8124     if (overlaps && pnum) {
   8125         *pnum = bsc->data_end - offset;
   8126     }
   8127 
   8128     return overlaps;
   8129 }
   8130 
   8131 /**
   8132  * See block_int.h for this function's documentation.
   8133  */
   8134 bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
   8135 {
   8136     IO_CODE();
   8137     RCU_READ_LOCK_GUARD();
   8138     return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
   8139 }
   8140 
   8141 /**
   8142  * See block_int.h for this function's documentation.
   8143  */
   8144 void bdrv_bsc_invalidate_range(BlockDriverState *bs,
   8145                                int64_t offset, int64_t bytes)
   8146 {
   8147     IO_CODE();
   8148     RCU_READ_LOCK_GUARD();
   8149 
   8150     if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
   8151         qatomic_set(&bs->block_status_cache->valid, false);
   8152     }
   8153 }
   8154 
   8155 /**
   8156  * See block_int.h for this function's documentation.
   8157  */
   8158 void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
   8159 {
   8160     BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
   8161     BdrvBlockStatusCache *old_bsc;
   8162     IO_CODE();
   8163 
   8164     *new_bsc = (BdrvBlockStatusCache) {
   8165         .valid = true,
   8166         .data_start = offset,
   8167         .data_end = offset + bytes,
   8168     };
   8169 
   8170     QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
   8171 
   8172     old_bsc = qatomic_rcu_read(&bs->block_status_cache);
   8173     qatomic_rcu_set(&bs->block_status_cache, new_bsc);
   8174     if (old_bsc) {
   8175         g_free_rcu(old_bsc, rcu);
   8176     }
   8177 }