qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

qcow2.c (210153B)


      1 /*
      2  * Block driver for the QCOW version 2 format
      3  *
      4  * Copyright (c) 2004-2006 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 
     25 #include "qemu/osdep.h"
     26 
     27 #include "block/qdict.h"
     28 #include "sysemu/block-backend.h"
     29 #include "qemu/main-loop.h"
     30 #include "qemu/module.h"
     31 #include "qcow2.h"
     32 #include "qemu/error-report.h"
     33 #include "qapi/error.h"
     34 #include "qapi/qapi-events-block-core.h"
     35 #include "qapi/qmp/qdict.h"
     36 #include "qapi/qmp/qstring.h"
     37 #include "trace.h"
     38 #include "qemu/option_int.h"
     39 #include "qemu/cutils.h"
     40 #include "qemu/bswap.h"
     41 #include "qemu/memalign.h"
     42 #include "qapi/qobject-input-visitor.h"
     43 #include "qapi/qapi-visit-block-core.h"
     44 #include "crypto.h"
     45 #include "block/aio_task.h"
     46 
     47 /*
     48   Differences with QCOW:
     49 
     50   - Support for multiple incremental snapshots.
     51   - Memory management by reference counts.
     52   - Clusters which have a reference count of one have the bit
     53     QCOW_OFLAG_COPIED to optimize write performance.
     54   - Size of compressed clusters is stored in sectors to reduce bit usage
     55     in the cluster offsets.
     56   - Support for storing additional data (such as the VM state) in the
     57     snapshots.
     58   - If a backing store is used, the cluster size is not constrained
     59     (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
     61 */
     62 
     63 
/*
 * Fixed-size header that precedes every qcow2 header extension in the image
 * file.  qcow2_read_extensions() reads it raw and converts both fields with
 * be32_to_cpu(); @len is the number of payload bytes that follow the header.
 */
typedef struct {
    uint32_t magic;
    uint32_t len;
} QEMU_PACKED QCowExtension;
     68 
/*
 * Magic values identifying the header extension types that
 * qcow2_read_extensions() dispatches on.  QCOW2_EXT_MAGIC_END terminates the
 * list of extensions.
 */
#define  QCOW2_EXT_MAGIC_END 0
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xe2792aca
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
#define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
#define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
#define  QCOW2_EXT_MAGIC_DATA_FILE 0x44415441
     75 
/*
 * Forward declaration so that the function can be referenced before its
 * definition, which is not in this part of the file.
 */
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t l2_entry,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset);
     83 
     84 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
     85 {
     86     const QCowHeader *cow_header = (const void *)buf;
     87 
     88     if (buf_size >= sizeof(QCowHeader) &&
     89         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
     90         be32_to_cpu(cow_header->version) >= 2)
     91         return 100;
     92     else
     93         return 0;
     94 }
     95 
     96 
     97 static int qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
     98                                       uint8_t *buf, size_t buflen,
     99                                       void *opaque, Error **errp)
    100 {
    101     BlockDriverState *bs = opaque;
    102     BDRVQcow2State *s = bs->opaque;
    103     ssize_t ret;
    104 
    105     if ((offset + buflen) > s->crypto_header.length) {
    106         error_setg(errp, "Request for data outside of extension header");
    107         return -1;
    108     }
    109 
    110     ret = bdrv_pread(bs->file, s->crypto_header.offset + offset, buflen, buf,
    111                      0);
    112     if (ret < 0) {
    113         error_setg_errno(errp, -ret, "Could not read encryption header");
    114         return -1;
    115     }
    116     return 0;
    117 }
    118 
    119 
    120 static int qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
    121                                       void *opaque, Error **errp)
    122 {
    123     BlockDriverState *bs = opaque;
    124     BDRVQcow2State *s = bs->opaque;
    125     int64_t ret;
    126     int64_t clusterlen;
    127 
    128     ret = qcow2_alloc_clusters(bs, headerlen);
    129     if (ret < 0) {
    130         error_setg_errno(errp, -ret,
    131                          "Cannot allocate cluster for LUKS header size %zu",
    132                          headerlen);
    133         return -1;
    134     }
    135 
    136     s->crypto_header.length = headerlen;
    137     s->crypto_header.offset = ret;
    138 
    139     /*
    140      * Zero fill all space in cluster so it has predictable
    141      * content, as we may not initialize some regions of the
    142      * header (eg only 1 out of 8 key slots will be initialized)
    143      */
    144     clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
    145     assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0);
    146     ret = bdrv_pwrite_zeroes(bs->file,
    147                              ret,
    148                              clusterlen, 0);
    149     if (ret < 0) {
    150         error_setg_errno(errp, -ret, "Could not zero fill encryption header");
    151         return -1;
    152     }
    153 
    154     return 0;
    155 }
    156 
    157 
    158 static int qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
    159                                        const uint8_t *buf, size_t buflen,
    160                                        void *opaque, Error **errp)
    161 {
    162     BlockDriverState *bs = opaque;
    163     BDRVQcow2State *s = bs->opaque;
    164     ssize_t ret;
    165 
    166     if ((offset + buflen) > s->crypto_header.length) {
    167         error_setg(errp, "Request for data outside of extension header");
    168         return -1;
    169     }
    170 
    171     ret = bdrv_pwrite(bs->file, s->crypto_header.offset + offset, buflen, buf,
    172                       0);
    173     if (ret < 0) {
    174         error_setg_errno(errp, -ret, "Could not read encryption header");
    175         return -1;
    176     }
    177     return 0;
    178 }
    179 
    180 static QDict*
    181 qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
    182 {
    183     QDict *cryptoopts_qdict;
    184     QDict *opts_qdict;
    185 
    186     /* Extract "encrypt." options into a qdict */
    187     opts_qdict = qemu_opts_to_qdict(opts, NULL);
    188     qdict_extract_subqdict(opts_qdict, &cryptoopts_qdict, "encrypt.");
    189     qobject_unref(opts_qdict);
    190     qdict_put_str(cryptoopts_qdict, "format", fmt);
    191     return cryptoopts_qdict;
    192 }
    193 
    194 /*
    195  * read qcow2 extension and fill bs
    196  * start reading from start_offset
    197  * finish reading upon magic of value 0 or when end_offset reached
    198  * unknown magic is skipped (future extension this version knows nothing about)
    199  * return 0 upon success, non-0 otherwise
    200  */
    201 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
    202                                  uint64_t end_offset, void **p_feature_table,
    203                                  int flags, bool *need_update_header,
    204                                  Error **errp)
    205 {
    206     BDRVQcow2State *s = bs->opaque;
    207     QCowExtension ext;
    208     uint64_t offset;
    209     int ret;
    210     Qcow2BitmapHeaderExt bitmaps_ext;
    211 
    212     if (need_update_header != NULL) {
    213         *need_update_header = false;
    214     }
    215 
    216 #ifdef DEBUG_EXT
    217     printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
    218 #endif
    219     offset = start_offset;
    220     while (offset < end_offset) {
    221 
    222 #ifdef DEBUG_EXT
    223         /* Sanity check */
    224         if (offset > s->cluster_size)
    225             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
    226 
    227         printf("attempting to read extended header in offset %lu\n", offset);
    228 #endif
    229 
    230         ret = bdrv_pread(bs->file, offset, sizeof(ext), &ext, 0);
    231         if (ret < 0) {
    232             error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
    233                              "pread fail from offset %" PRIu64, offset);
    234             return 1;
    235         }
    236         ext.magic = be32_to_cpu(ext.magic);
    237         ext.len = be32_to_cpu(ext.len);
    238         offset += sizeof(ext);
    239 #ifdef DEBUG_EXT
    240         printf("ext.magic = 0x%x\n", ext.magic);
    241 #endif
    242         if (offset > end_offset || ext.len > end_offset - offset) {
    243             error_setg(errp, "Header extension too large");
    244             return -EINVAL;
    245         }
    246 
    247         switch (ext.magic) {
    248         case QCOW2_EXT_MAGIC_END:
    249             return 0;
    250 
    251         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
    252             if (ext.len >= sizeof(bs->backing_format)) {
    253                 error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
    254                            " too large (>=%zu)", ext.len,
    255                            sizeof(bs->backing_format));
    256                 return 2;
    257             }
    258             ret = bdrv_pread(bs->file, offset, ext.len, bs->backing_format, 0);
    259             if (ret < 0) {
    260                 error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
    261                                  "Could not read format name");
    262                 return 3;
    263             }
    264             bs->backing_format[ext.len] = '\0';
    265             s->image_backing_format = g_strdup(bs->backing_format);
    266 #ifdef DEBUG_EXT
    267             printf("Qcow2: Got format extension %s\n", bs->backing_format);
    268 #endif
    269             break;
    270 
    271         case QCOW2_EXT_MAGIC_FEATURE_TABLE:
    272             if (p_feature_table != NULL) {
    273                 void *feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
    274                 ret = bdrv_pread(bs->file, offset, ext.len, feature_table, 0);
    275                 if (ret < 0) {
    276                     error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
    277                                      "Could not read table");
    278                     g_free(feature_table);
    279                     return ret;
    280                 }
    281 
    282                 *p_feature_table = feature_table;
    283             }
    284             break;
    285 
    286         case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
    287             unsigned int cflags = 0;
    288             if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
    289                 error_setg(errp, "CRYPTO header extension only "
    290                            "expected with LUKS encryption method");
    291                 return -EINVAL;
    292             }
    293             if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
    294                 error_setg(errp, "CRYPTO header extension size %u, "
    295                            "but expected size %zu", ext.len,
    296                            sizeof(Qcow2CryptoHeaderExtension));
    297                 return -EINVAL;
    298             }
    299 
    300             ret = bdrv_pread(bs->file, offset, ext.len, &s->crypto_header, 0);
    301             if (ret < 0) {
    302                 error_setg_errno(errp, -ret,
    303                                  "Unable to read CRYPTO header extension");
    304                 return ret;
    305             }
    306             s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
    307             s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
    308 
    309             if ((s->crypto_header.offset % s->cluster_size) != 0) {
    310                 error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
    311                            "not a multiple of cluster size '%u'",
    312                            s->crypto_header.offset, s->cluster_size);
    313                 return -EINVAL;
    314             }
    315 
    316             if (flags & BDRV_O_NO_IO) {
    317                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
    318             }
    319             s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
    320                                            qcow2_crypto_hdr_read_func,
    321                                            bs, cflags, QCOW2_MAX_THREADS, errp);
    322             if (!s->crypto) {
    323                 return -EINVAL;
    324             }
    325         }   break;
    326 
    327         case QCOW2_EXT_MAGIC_BITMAPS:
    328             if (ext.len != sizeof(bitmaps_ext)) {
    329                 error_setg_errno(errp, -ret, "bitmaps_ext: "
    330                                  "Invalid extension length");
    331                 return -EINVAL;
    332             }
    333 
    334             if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
    335                 if (s->qcow_version < 3) {
    336                     /* Let's be a bit more specific */
    337                     warn_report("This qcow2 v2 image contains bitmaps, but "
    338                                 "they may have been modified by a program "
    339                                 "without persistent bitmap support; so now "
    340                                 "they must all be considered inconsistent");
    341                 } else {
    342                     warn_report("a program lacking bitmap support "
    343                                 "modified this file, so all bitmaps are now "
    344                                 "considered inconsistent");
    345                 }
    346                 error_printf("Some clusters may be leaked, "
    347                              "run 'qemu-img check -r' on the image "
    348                              "file to fix.");
    349                 if (need_update_header != NULL) {
    350                     /* Updating is needed to drop invalid bitmap extension. */
    351                     *need_update_header = true;
    352                 }
    353                 break;
    354             }
    355 
    356             ret = bdrv_pread(bs->file, offset, ext.len, &bitmaps_ext, 0);
    357             if (ret < 0) {
    358                 error_setg_errno(errp, -ret, "bitmaps_ext: "
    359                                  "Could not read ext header");
    360                 return ret;
    361             }
    362 
    363             if (bitmaps_ext.reserved32 != 0) {
    364                 error_setg_errno(errp, -ret, "bitmaps_ext: "
    365                                  "Reserved field is not zero");
    366                 return -EINVAL;
    367             }
    368 
    369             bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
    370             bitmaps_ext.bitmap_directory_size =
    371                 be64_to_cpu(bitmaps_ext.bitmap_directory_size);
    372             bitmaps_ext.bitmap_directory_offset =
    373                 be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
    374 
    375             if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
    376                 error_setg(errp,
    377                            "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
    378                            "exceeding the QEMU supported maximum of %d",
    379                            bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
    380                 return -EINVAL;
    381             }
    382 
    383             if (bitmaps_ext.nb_bitmaps == 0) {
    384                 error_setg(errp, "found bitmaps extension with zero bitmaps");
    385                 return -EINVAL;
    386             }
    387 
    388             if (offset_into_cluster(s, bitmaps_ext.bitmap_directory_offset)) {
    389                 error_setg(errp, "bitmaps_ext: "
    390                                  "invalid bitmap directory offset");
    391                 return -EINVAL;
    392             }
    393 
    394             if (bitmaps_ext.bitmap_directory_size >
    395                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
    396                 error_setg(errp, "bitmaps_ext: "
    397                                  "bitmap directory size (%" PRIu64 ") exceeds "
    398                                  "the maximum supported size (%d)",
    399                                  bitmaps_ext.bitmap_directory_size,
    400                                  QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
    401                 return -EINVAL;
    402             }
    403 
    404             s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
    405             s->bitmap_directory_offset =
    406                     bitmaps_ext.bitmap_directory_offset;
    407             s->bitmap_directory_size =
    408                     bitmaps_ext.bitmap_directory_size;
    409 
    410 #ifdef DEBUG_EXT
    411             printf("Qcow2: Got bitmaps extension: "
    412                    "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
    413                    s->bitmap_directory_offset, s->nb_bitmaps);
    414 #endif
    415             break;
    416 
    417         case QCOW2_EXT_MAGIC_DATA_FILE:
    418         {
    419             s->image_data_file = g_malloc0(ext.len + 1);
    420             ret = bdrv_pread(bs->file, offset, ext.len, s->image_data_file, 0);
    421             if (ret < 0) {
    422                 error_setg_errno(errp, -ret,
    423                                  "ERROR: Could not read data file name");
    424                 return ret;
    425             }
    426 #ifdef DEBUG_EXT
    427             printf("Qcow2: Got external data file %s\n", s->image_data_file);
    428 #endif
    429             break;
    430         }
    431 
    432         default:
    433             /* unknown magic - save it in case we need to rewrite the header */
    434             /* If you add a new feature, make sure to also update the fast
    435              * path of qcow2_make_empty() to deal with it. */
    436             {
    437                 Qcow2UnknownHeaderExtension *uext;
    438 
    439                 uext = g_malloc0(sizeof(*uext)  + ext.len);
    440                 uext->magic = ext.magic;
    441                 uext->len = ext.len;
    442                 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
    443 
    444                 ret = bdrv_pread(bs->file, offset, uext->len, uext->data, 0);
    445                 if (ret < 0) {
    446                     error_setg_errno(errp, -ret, "ERROR: unknown extension: "
    447                                      "Could not read data");
    448                     return ret;
    449                 }
    450             }
    451             break;
    452         }
    453 
    454         offset += ((ext.len + 7) & ~7);
    455     }
    456 
    457     return 0;
    458 }
    459 
    460 static void cleanup_unknown_header_ext(BlockDriverState *bs)
    461 {
    462     BDRVQcow2State *s = bs->opaque;
    463     Qcow2UnknownHeaderExtension *uext, *next;
    464 
    465     QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
    466         QLIST_REMOVE(uext, next);
    467         g_free(uext);
    468     }
    469 }
    470 
    471 static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
    472                                        uint64_t mask)
    473 {
    474     g_autoptr(GString) features = g_string_sized_new(60);
    475 
    476     while (table && table->name[0] != '\0') {
    477         if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
    478             if (mask & (1ULL << table->bit)) {
    479                 if (features->len > 0) {
    480                     g_string_append(features, ", ");
    481                 }
    482                 g_string_append_printf(features, "%.46s", table->name);
    483                 mask &= ~(1ULL << table->bit);
    484             }
    485         }
    486         table++;
    487     }
    488 
    489     if (mask) {
    490         if (features->len > 0) {
    491             g_string_append(features, ", ");
    492         }
    493         g_string_append_printf(features,
    494                                "Unknown incompatible feature: %" PRIx64, mask);
    495     }
    496 
    497     error_setg(errp, "Unsupported qcow2 feature(s): %s", features->str);
    498 }
    499 
    500 /*
    501  * Sets the dirty bit and flushes afterwards if necessary.
    502  *
    503  * The incompatible_features bit is only set if the image file header was
    504  * updated successfully.  Therefore it is not required to check the return
    505  * value of this function.
    506  */
    507 int qcow2_mark_dirty(BlockDriverState *bs)
    508 {
    509     BDRVQcow2State *s = bs->opaque;
    510     uint64_t val;
    511     int ret;
    512 
    513     assert(s->qcow_version >= 3);
    514 
    515     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
    516         return 0; /* already dirty */
    517     }
    518 
    519     val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
    520     ret = bdrv_pwrite_sync(bs->file,
    521                            offsetof(QCowHeader, incompatible_features),
    522                            sizeof(val), &val, 0);
    523     if (ret < 0) {
    524         return ret;
    525     }
    526 
    527     /* Only treat image as dirty if the header was updated successfully */
    528     s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
    529     return 0;
    530 }
    531 
    532 /*
    533  * Clears the dirty bit and flushes before if necessary.  Only call this
    534  * function when there are no pending requests, it does not guard against
    535  * concurrent requests dirtying the image.
    536  */
    537 static int qcow2_mark_clean(BlockDriverState *bs)
    538 {
    539     BDRVQcow2State *s = bs->opaque;
    540 
    541     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
    542         int ret;
    543 
    544         s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
    545 
    546         ret = qcow2_flush_caches(bs);
    547         if (ret < 0) {
    548             return ret;
    549         }
    550 
    551         return qcow2_update_header(bs);
    552     }
    553     return 0;
    554 }
    555 
    556 /*
    557  * Marks the image as corrupt.
    558  */
    559 int qcow2_mark_corrupt(BlockDriverState *bs)
    560 {
    561     BDRVQcow2State *s = bs->opaque;
    562 
    563     s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
    564     return qcow2_update_header(bs);
    565 }
    566 
    567 /*
    568  * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
    569  * before if necessary.
    570  */
    571 int qcow2_mark_consistent(BlockDriverState *bs)
    572 {
    573     BDRVQcow2State *s = bs->opaque;
    574 
    575     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
    576         int ret = qcow2_flush_caches(bs);
    577         if (ret < 0) {
    578             return ret;
    579         }
    580 
    581         s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
    582         return qcow2_update_header(bs);
    583     }
    584     return 0;
    585 }
    586 
    587 static void qcow2_add_check_result(BdrvCheckResult *out,
    588                                    const BdrvCheckResult *src,
    589                                    bool set_allocation_info)
    590 {
    591     out->corruptions += src->corruptions;
    592     out->leaks += src->leaks;
    593     out->check_errors += src->check_errors;
    594     out->corruptions_fixed += src->corruptions_fixed;
    595     out->leaks_fixed += src->leaks_fixed;
    596 
    597     if (set_allocation_info) {
    598         out->image_end_offset = src->image_end_offset;
    599         out->bfi = src->bfi;
    600     }
    601 }
    602 
    603 static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
    604                                               BdrvCheckResult *result,
    605                                               BdrvCheckMode fix)
    606 {
    607     BdrvCheckResult snapshot_res = {};
    608     BdrvCheckResult refcount_res = {};
    609     int ret;
    610 
    611     memset(result, 0, sizeof(*result));
    612 
    613     ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
    614     if (ret < 0) {
    615         qcow2_add_check_result(result, &snapshot_res, false);
    616         return ret;
    617     }
    618 
    619     ret = qcow2_check_refcounts(bs, &refcount_res, fix);
    620     qcow2_add_check_result(result, &refcount_res, true);
    621     if (ret < 0) {
    622         qcow2_add_check_result(result, &snapshot_res, false);
    623         return ret;
    624     }
    625 
    626     ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
    627     qcow2_add_check_result(result, &snapshot_res, false);
    628     if (ret < 0) {
    629         return ret;
    630     }
    631 
    632     if (fix && result->check_errors == 0 && result->corruptions == 0) {
    633         ret = qcow2_mark_clean(bs);
    634         if (ret < 0) {
    635             return ret;
    636         }
    637         return qcow2_mark_consistent(bs);
    638     }
    639     return ret;
    640 }
    641 
    642 static int coroutine_fn qcow2_co_check(BlockDriverState *bs,
    643                                        BdrvCheckResult *result,
    644                                        BdrvCheckMode fix)
    645 {
    646     BDRVQcow2State *s = bs->opaque;
    647     int ret;
    648 
    649     qemu_co_mutex_lock(&s->lock);
    650     ret = qcow2_co_check_locked(bs, result, fix);
    651     qemu_co_mutex_unlock(&s->lock);
    652     return ret;
    653 }
    654 
    655 int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
    656                          uint64_t entries, size_t entry_len,
    657                          int64_t max_size_bytes, const char *table_name,
    658                          Error **errp)
    659 {
    660     BDRVQcow2State *s = bs->opaque;
    661 
    662     if (entries > max_size_bytes / entry_len) {
    663         error_setg(errp, "%s too large", table_name);
    664         return -EFBIG;
    665     }
    666 
    667     /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
    668      * because values will be passed to qemu functions taking int64_t. */
    669     if ((INT64_MAX - entries * entry_len < offset) ||
    670         (offset_into_cluster(s, offset) != 0)) {
    671         error_setg(errp, "%s offset invalid", table_name);
    672         return -EINVAL;
    673     }
    674 
    675     return 0;
    676 }
    677 
/*
 * NULL-terminated list of qcow2 runtime option names.
 * NOTE(review): presumably these are the options that may be changed while
 * the image is open (e.g. on reopen) - confirm against the users of this
 * array, which are not in this part of the file.
 */
static const char *const mutable_opts[] = {
    QCOW2_OPT_LAZY_REFCOUNTS,
    QCOW2_OPT_DISCARD_REQUEST,
    QCOW2_OPT_DISCARD_SNAPSHOT,
    QCOW2_OPT_DISCARD_OTHER,
    QCOW2_OPT_OVERLAP,
    QCOW2_OPT_OVERLAP_TEMPLATE,
    QCOW2_OPT_OVERLAP_MAIN_HEADER,
    QCOW2_OPT_OVERLAP_ACTIVE_L1,
    QCOW2_OPT_OVERLAP_ACTIVE_L2,
    QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    QCOW2_OPT_OVERLAP_INACTIVE_L1,
    QCOW2_OPT_OVERLAP_INACTIVE_L2,
    QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
    QCOW2_OPT_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
    QCOW2_OPT_REFCOUNT_CACHE_SIZE,
    QCOW2_OPT_CACHE_CLEAN_INTERVAL,
    NULL
};
    701 
    702 static QemuOptsList qcow2_runtime_opts = {
    703     .name = "qcow2",
    704     .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    705     .desc = {
    706         {
    707             .name = QCOW2_OPT_LAZY_REFCOUNTS,
    708             .type = QEMU_OPT_BOOL,
    709             .help = "Postpone refcount updates",
    710         },
    711         {
    712             .name = QCOW2_OPT_DISCARD_REQUEST,
    713             .type = QEMU_OPT_BOOL,
    714             .help = "Pass guest discard requests to the layer below",
    715         },
    716         {
    717             .name = QCOW2_OPT_DISCARD_SNAPSHOT,
    718             .type = QEMU_OPT_BOOL,
    719             .help = "Generate discard requests when snapshot related space "
    720                     "is freed",
    721         },
    722         {
    723             .name = QCOW2_OPT_DISCARD_OTHER,
    724             .type = QEMU_OPT_BOOL,
    725             .help = "Generate discard requests when other clusters are freed",
    726         },
    727         {
    728             .name = QCOW2_OPT_OVERLAP,
    729             .type = QEMU_OPT_STRING,
    730             .help = "Selects which overlap checks to perform from a range of "
    731                     "templates (none, constant, cached, all)",
    732         },
    733         {
    734             .name = QCOW2_OPT_OVERLAP_TEMPLATE,
    735             .type = QEMU_OPT_STRING,
    736             .help = "Selects which overlap checks to perform from a range of "
    737                     "templates (none, constant, cached, all)",
    738         },
    739         {
    740             .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    741             .type = QEMU_OPT_BOOL,
    742             .help = "Check for unintended writes into the main qcow2 header",
    743         },
    744         {
    745             .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    746             .type = QEMU_OPT_BOOL,
    747             .help = "Check for unintended writes into the active L1 table",
    748         },
    749         {
    750             .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    751             .type = QEMU_OPT_BOOL,
    752             .help = "Check for unintended writes into an active L2 table",
    753         },
    754         {
    755             .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    756             .type = QEMU_OPT_BOOL,
    757             .help = "Check for unintended writes into the refcount table",
    758         },
    759         {
    760             .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    761             .type = QEMU_OPT_BOOL,
    762             .help = "Check for unintended writes into a refcount block",
    763         },
    764         {
    765             .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    766             .type = QEMU_OPT_BOOL,
    767             .help = "Check for unintended writes into the snapshot table",
    768         },
    769         {
    770             .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    771             .type = QEMU_OPT_BOOL,
    772             .help = "Check for unintended writes into an inactive L1 table",
    773         },
    774         {
    775             .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    776             .type = QEMU_OPT_BOOL,
    777             .help = "Check for unintended writes into an inactive L2 table",
    778         },
    779         {
    780             .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
    781             .type = QEMU_OPT_BOOL,
    782             .help = "Check for unintended writes into the bitmap directory",
    783         },
    784         {
    785             .name = QCOW2_OPT_CACHE_SIZE,
    786             .type = QEMU_OPT_SIZE,
    787             .help = "Maximum combined metadata (L2 tables and refcount blocks) "
    788                     "cache size",
    789         },
    790         {
    791             .name = QCOW2_OPT_L2_CACHE_SIZE,
    792             .type = QEMU_OPT_SIZE,
    793             .help = "Maximum L2 table cache size",
    794         },
    795         {
    796             .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
    797             .type = QEMU_OPT_SIZE,
    798             .help = "Size of each entry in the L2 cache",
    799         },
    800         {
    801             .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
    802             .type = QEMU_OPT_SIZE,
    803             .help = "Maximum refcount block cache size",
    804         },
    805         {
    806             .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
    807             .type = QEMU_OPT_NUMBER,
    808             .help = "Clean unused cache entries after this time (in seconds)",
    809         },
    810         BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
    811             "ID of secret providing qcow2 AES key or LUKS passphrase"),
    812         { /* end of list */ }
    813     },
    814 };
    815 
    816 static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    817     [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    818     [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    819     [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    820     [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    821     [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    822     [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    823     [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    824     [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    825     [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
    826 };
    827 
    828 static void cache_clean_timer_cb(void *opaque)
    829 {
    830     BlockDriverState *bs = opaque;
    831     BDRVQcow2State *s = bs->opaque;
    832     qcow2_cache_clean_unused(s->l2_table_cache);
    833     qcow2_cache_clean_unused(s->refcount_block_cache);
    834     timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
    835               (int64_t) s->cache_clean_interval * 1000);
    836 }
    837 
    838 static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
    839 {
    840     BDRVQcow2State *s = bs->opaque;
    841     if (s->cache_clean_interval > 0) {
    842         s->cache_clean_timer =
    843             aio_timer_new_with_attrs(context, QEMU_CLOCK_VIRTUAL,
    844                                      SCALE_MS, QEMU_TIMER_ATTR_EXTERNAL,
    845                                      cache_clean_timer_cb, bs);
    846         timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
    847                   (int64_t) s->cache_clean_interval * 1000);
    848     }
    849 }
    850 
    851 static void cache_clean_timer_del(BlockDriverState *bs)
    852 {
    853     BDRVQcow2State *s = bs->opaque;
    854     if (s->cache_clean_timer) {
    855         timer_free(s->cache_clean_timer);
    856         s->cache_clean_timer = NULL;
    857     }
    858 }
    859 
    860 static void qcow2_detach_aio_context(BlockDriverState *bs)
    861 {
    862     cache_clean_timer_del(bs);
    863 }
    864 
    865 static void qcow2_attach_aio_context(BlockDriverState *bs,
    866                                      AioContext *new_context)
    867 {
    868     cache_clean_timer_init(bs, new_context);
    869 }
    870 
    871 static bool read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
    872                              uint64_t *l2_cache_size,
    873                              uint64_t *l2_cache_entry_size,
    874                              uint64_t *refcount_cache_size, Error **errp)
    875 {
    876     BDRVQcow2State *s = bs->opaque;
    877     uint64_t combined_cache_size, l2_cache_max_setting;
    878     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
    879     bool l2_cache_entry_size_set;
    880     int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
    881     uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    882     uint64_t max_l2_entries = DIV_ROUND_UP(virtual_disk_size, s->cluster_size);
    883     /* An L2 table is always one cluster in size so the max cache size
    884      * should be a multiple of the cluster size. */
    885     uint64_t max_l2_cache = ROUND_UP(max_l2_entries * l2_entry_size(s),
    886                                      s->cluster_size);
    887 
    888     combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
    889     l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
    890     refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    891     l2_cache_entry_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE);
    892 
    893     combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
    894     l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
    895                                              DEFAULT_L2_CACHE_MAX_SIZE);
    896     *refcount_cache_size = qemu_opt_get_size(opts,
    897                                              QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
    898 
    899     *l2_cache_entry_size = qemu_opt_get_size(
    900         opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
    901 
    902     *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);
    903 
    904     if (combined_cache_size_set) {
    905         if (l2_cache_size_set && refcount_cache_size_set) {
    906             error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
    907                        " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
    908                        "at the same time");
    909             return false;
    910         } else if (l2_cache_size_set &&
    911                    (l2_cache_max_setting > combined_cache_size)) {
    912             error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
    913                        QCOW2_OPT_CACHE_SIZE);
    914             return false;
    915         } else if (*refcount_cache_size > combined_cache_size) {
    916             error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
    917                        QCOW2_OPT_CACHE_SIZE);
    918             return false;
    919         }
    920 
    921         if (l2_cache_size_set) {
    922             *refcount_cache_size = combined_cache_size - *l2_cache_size;
    923         } else if (refcount_cache_size_set) {
    924             *l2_cache_size = combined_cache_size - *refcount_cache_size;
    925         } else {
    926             /* Assign as much memory as possible to the L2 cache, and
    927              * use the remainder for the refcount cache */
    928             if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
    929                 *l2_cache_size = max_l2_cache;
    930                 *refcount_cache_size = combined_cache_size - *l2_cache_size;
    931             } else {
    932                 *refcount_cache_size =
    933                     MIN(combined_cache_size, min_refcount_cache);
    934                 *l2_cache_size = combined_cache_size - *refcount_cache_size;
    935             }
    936         }
    937     }
    938 
    939     /*
    940      * If the L2 cache is not enough to cover the whole disk then
    941      * default to 4KB entries. Smaller entries reduce the cost of
    942      * loads and evictions and increase I/O performance.
    943      */
    944     if (*l2_cache_size < max_l2_cache && !l2_cache_entry_size_set) {
    945         *l2_cache_entry_size = MIN(s->cluster_size, 4096);
    946     }
    947 
    948     /* l2_cache_size and refcount_cache_size are ensured to have at least
    949      * their minimum values in qcow2_update_options_prepare() */
    950 
    951     if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
    952         *l2_cache_entry_size > s->cluster_size ||
    953         !is_power_of_2(*l2_cache_entry_size)) {
    954         error_setg(errp, "L2 cache entry size must be a power of two "
    955                    "between %d and the cluster size (%d)",
    956                    1 << MIN_CLUSTER_BITS, s->cluster_size);
    957         return false;
    958     }
    959 
    960     return true;
    961 }
    962 
/*
 * Runtime option state staged by qcow2_update_options_prepare().  It is
 * either copied into BDRVQcow2State by qcow2_update_options_commit() or
 * released by qcow2_update_options_abort().
 */
typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache; /* Newly created L2 table cache */
    Qcow2Cache *refcount_block_cache; /* Newly created refcount block cache */
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
    bool use_lazy_refcounts; /* New value of the lazy refcounts setting */
    int overlap_check; /* Bitmask of enabled overlap checks (bit = BITNR) */
    bool discard_passthrough[QCOW2_DISCARD_MAX]; /* Indexed by discard type */
    uint64_t cache_clean_interval; /* New cache cleanup timer interval */
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
} Qcow2ReopenState;
    973 
/*
 * Parse and validate the qcow2 runtime options in @options and stage the
 * resulting settings in @r.
 *
 * This function does not assign any field of the driver state itself; it
 * only reads it.  It does, however, flush the currently installed metadata
 * caches and may call qcow2_mark_clean() when lazy refcounts are being
 * turned off.
 *
 * All "encrypt."-prefixed keys are extracted from @options, and the keys
 * known to qcow2_runtime_opts are absorbed from it.
 *
 * Returns 0 on success.  On failure a non-zero error value is returned
 * (-EINVAL, -ENOMEM, or whatever the failing flush/mark-clean call
 * returned) and @errp is expected to be set.  On failure @r may already
 * hold newly created caches or crypto options; the caller releases them
 * with qcow2_update_options_abort().
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QemuOpts *opts = NULL;
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
    int i;
    const char *encryptfmt;
    QDict *encryptopts = NULL;
    int ret;

    /* Split the "encrypt.*" options off into their own dictionary */
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
    if (!read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
                          &refcount_cache_size, errp)) {
        ret = -EINVAL;
        goto fail;
    }

    /* From here on l2_cache_size counts cache entries, not bytes */
    l2_cache_size /= l2_cache_entry_size;
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* Likewise refcount_cache_size now counts clusters, not bytes */
    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

    r->l2_slice_size = l2_cache_entry_size / l2_entry_size(s);
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
                                           l2_cache_entry_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
                                                 s->cluster_size);
    /* If only one allocation succeeded, the caller's abort step frees it */
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
    r->cache_clean_interval =
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
                            DEFAULT_CACHE_CLEAN_INTERVAL);
#ifndef CONFIG_LINUX
    /* Without CONFIG_LINUX only an interval of 0 (disabled) is accepted */
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
    if (r->cache_clean_interval > UINT_MAX) {
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

    /* lazy-refcounts; flush if going from enabled to disabled */
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }

    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

    /* Overlap check options */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    /* Both options select the template; they may coexist only if equal */
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    /* Default template is "cached" when neither option is given */
    if (!opt_overlap_check) {
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

    r->overlap_check = 0;
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
        r->overlap_check |=
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

    /* Discard pass-through policy per discard type; NEVER and ALWAYS are
     * fixed, REQUEST defaults to whether BDRV_O_UNMAP is set in @flags */
    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

    /* The "encrypt.format" option, if given, must agree with the encryption
     * method recorded in the image header */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        /* The format value handed on for 'aes' images is "qcow" */
        qdict_put_str(encryptopts, "format", "qcow");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        if (!r->crypto_opts) {
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "luks");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        if (!r->crypto_opts) {
            ret = -EINVAL;
            goto fail;
        }
        break;

    default:
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;
    /* Shared exit path: reached on success as well as on failure */
fail:
    qobject_unref(encryptopts);
    qemu_opts_del(opts);
    opts = NULL;
    return ret;
}
   1197 
   1198 static void qcow2_update_options_commit(BlockDriverState *bs,
   1199                                         Qcow2ReopenState *r)
   1200 {
   1201     BDRVQcow2State *s = bs->opaque;
   1202     int i;
   1203 
   1204     if (s->l2_table_cache) {
   1205         qcow2_cache_destroy(s->l2_table_cache);
   1206     }
   1207     if (s->refcount_block_cache) {
   1208         qcow2_cache_destroy(s->refcount_block_cache);
   1209     }
   1210     s->l2_table_cache = r->l2_table_cache;
   1211     s->refcount_block_cache = r->refcount_block_cache;
   1212     s->l2_slice_size = r->l2_slice_size;
   1213 
   1214     s->overlap_check = r->overlap_check;
   1215     s->use_lazy_refcounts = r->use_lazy_refcounts;
   1216 
   1217     for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
   1218         s->discard_passthrough[i] = r->discard_passthrough[i];
   1219     }
   1220 
   1221     if (s->cache_clean_interval != r->cache_clean_interval) {
   1222         cache_clean_timer_del(bs);
   1223         s->cache_clean_interval = r->cache_clean_interval;
   1224         cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
   1225     }
   1226 
   1227     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
   1228     s->crypto_opts = r->crypto_opts;
   1229 }
   1230 
   1231 static void qcow2_update_options_abort(BlockDriverState *bs,
   1232                                        Qcow2ReopenState *r)
   1233 {
   1234     if (r->l2_table_cache) {
   1235         qcow2_cache_destroy(r->l2_table_cache);
   1236     }
   1237     if (r->refcount_block_cache) {
   1238         qcow2_cache_destroy(r->refcount_block_cache);
   1239     }
   1240     qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
   1241 }
   1242 
   1243 static int qcow2_update_options(BlockDriverState *bs, QDict *options,
   1244                                 int flags, Error **errp)
   1245 {
   1246     Qcow2ReopenState r = {};
   1247     int ret;
   1248 
   1249     ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
   1250     if (ret >= 0) {
   1251         qcow2_update_options_commit(bs, &r);
   1252     } else {
   1253         qcow2_update_options_abort(bs, &r);
   1254     }
   1255 
   1256     return ret;
   1257 }
   1258 
   1259 static int validate_compression_type(BDRVQcow2State *s, Error **errp)
   1260 {
   1261     switch (s->compression_type) {
   1262     case QCOW2_COMPRESSION_TYPE_ZLIB:
   1263 #ifdef CONFIG_ZSTD
   1264     case QCOW2_COMPRESSION_TYPE_ZSTD:
   1265 #endif
   1266         break;
   1267 
   1268     default:
   1269         error_setg(errp, "qcow2: unknown compression type: %u",
   1270                    s->compression_type);
   1271         return -ENOTSUP;
   1272     }
   1273 
   1274     /*
   1275      * if the compression type differs from QCOW2_COMPRESSION_TYPE_ZLIB
   1276      * the incompatible feature flag must be set
   1277      */
   1278     if (s->compression_type == QCOW2_COMPRESSION_TYPE_ZLIB) {
   1279         if (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) {
   1280             error_setg(errp, "qcow2: Compression type incompatible feature "
   1281                              "bit must not be set");
   1282             return -EINVAL;
   1283         }
   1284     } else {
   1285         if (!(s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION)) {
   1286             error_setg(errp, "qcow2: Compression type incompatible feature "
   1287                              "bit must be set");
   1288             return -EINVAL;
   1289         }
   1290     }
   1291 
   1292     return 0;
   1293 }
   1294 
   1295 /* Called with s->lock held.  */
   1296 static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
   1297                                       int flags, bool open_data_file,
   1298                                       Error **errp)
   1299 {
   1300     ERRP_GUARD();
   1301     BDRVQcow2State *s = bs->opaque;
   1302     unsigned int len, i;
   1303     int ret = 0;
   1304     QCowHeader header;
   1305     uint64_t ext_end;
   1306     uint64_t l1_vm_state_index;
   1307     bool update_header = false;
   1308 
   1309     ret = bdrv_co_pread(bs->file, 0, sizeof(header), &header, 0);
   1310     if (ret < 0) {
   1311         error_setg_errno(errp, -ret, "Could not read qcow2 header");
   1312         goto fail;
   1313     }
   1314     header.magic = be32_to_cpu(header.magic);
   1315     header.version = be32_to_cpu(header.version);
   1316     header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
   1317     header.backing_file_size = be32_to_cpu(header.backing_file_size);
   1318     header.size = be64_to_cpu(header.size);
   1319     header.cluster_bits = be32_to_cpu(header.cluster_bits);
   1320     header.crypt_method = be32_to_cpu(header.crypt_method);
   1321     header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
   1322     header.l1_size = be32_to_cpu(header.l1_size);
   1323     header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
   1324     header.refcount_table_clusters =
   1325         be32_to_cpu(header.refcount_table_clusters);
   1326     header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
   1327     header.nb_snapshots = be32_to_cpu(header.nb_snapshots);
   1328 
   1329     if (header.magic != QCOW_MAGIC) {
   1330         error_setg(errp, "Image is not in qcow2 format");
   1331         ret = -EINVAL;
   1332         goto fail;
   1333     }
   1334     if (header.version < 2 || header.version > 3) {
   1335         error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
   1336         ret = -ENOTSUP;
   1337         goto fail;
   1338     }
   1339 
   1340     s->qcow_version = header.version;
   1341 
   1342     /* Initialise cluster size */
   1343     if (header.cluster_bits < MIN_CLUSTER_BITS ||
   1344         header.cluster_bits > MAX_CLUSTER_BITS) {
   1345         error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
   1346                    header.cluster_bits);
   1347         ret = -EINVAL;
   1348         goto fail;
   1349     }
   1350 
   1351     s->cluster_bits = header.cluster_bits;
   1352     s->cluster_size = 1 << s->cluster_bits;
   1353 
   1354     /* Initialise version 3 header fields */
   1355     if (header.version == 2) {
   1356         header.incompatible_features    = 0;
   1357         header.compatible_features      = 0;
   1358         header.autoclear_features       = 0;
   1359         header.refcount_order           = 4;
   1360         header.header_length            = 72;
   1361     } else {
   1362         header.incompatible_features =
   1363             be64_to_cpu(header.incompatible_features);
   1364         header.compatible_features = be64_to_cpu(header.compatible_features);
   1365         header.autoclear_features = be64_to_cpu(header.autoclear_features);
   1366         header.refcount_order = be32_to_cpu(header.refcount_order);
   1367         header.header_length = be32_to_cpu(header.header_length);
   1368 
   1369         if (header.header_length < 104) {
   1370             error_setg(errp, "qcow2 header too short");
   1371             ret = -EINVAL;
   1372             goto fail;
   1373         }
   1374     }
   1375 
   1376     if (header.header_length > s->cluster_size) {
   1377         error_setg(errp, "qcow2 header exceeds cluster size");
   1378         ret = -EINVAL;
   1379         goto fail;
   1380     }
   1381 
   1382     if (header.header_length > sizeof(header)) {
   1383         s->unknown_header_fields_size = header.header_length - sizeof(header);
   1384         s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
   1385         ret = bdrv_co_pread(bs->file, sizeof(header),
   1386                             s->unknown_header_fields_size,
   1387                             s->unknown_header_fields, 0);
   1388         if (ret < 0) {
   1389             error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
   1390                              "fields");
   1391             goto fail;
   1392         }
   1393     }
   1394 
   1395     if (header.backing_file_offset > s->cluster_size) {
   1396         error_setg(errp, "Invalid backing file offset");
   1397         ret = -EINVAL;
   1398         goto fail;
   1399     }
   1400 
   1401     if (header.backing_file_offset) {
   1402         ext_end = header.backing_file_offset;
   1403     } else {
   1404         ext_end = 1 << header.cluster_bits;
   1405     }
   1406 
   1407     /* Handle feature bits */
   1408     s->incompatible_features    = header.incompatible_features;
   1409     s->compatible_features      = header.compatible_features;
   1410     s->autoclear_features       = header.autoclear_features;
   1411 
   1412     /*
   1413      * Handle compression type
   1414      * Older qcow2 images don't contain the compression type header.
   1415      * Distinguish them by the header length and use
   1416      * the only valid (default) compression type in that case
   1417      */
   1418     if (header.header_length > offsetof(QCowHeader, compression_type)) {
   1419         s->compression_type = header.compression_type;
   1420     } else {
   1421         s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
   1422     }
   1423 
   1424     ret = validate_compression_type(s, errp);
   1425     if (ret) {
   1426         goto fail;
   1427     }
   1428 
   1429     if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
   1430         void *feature_table = NULL;
   1431         qcow2_read_extensions(bs, header.header_length, ext_end,
   1432                               &feature_table, flags, NULL, NULL);
   1433         report_unsupported_feature(errp, feature_table,
   1434                                    s->incompatible_features &
   1435                                    ~QCOW2_INCOMPAT_MASK);
   1436         ret = -ENOTSUP;
   1437         g_free(feature_table);
   1438         goto fail;
   1439     }
   1440 
   1441     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
   1442         /* Corrupt images may not be written to unless they are being repaired
   1443          */
   1444         if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
   1445             error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
   1446                        "read/write");
   1447             ret = -EACCES;
   1448             goto fail;
   1449         }
   1450     }
   1451 
   1452     s->subclusters_per_cluster =
   1453         has_subclusters(s) ? QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER : 1;
   1454     s->subcluster_size = s->cluster_size / s->subclusters_per_cluster;
   1455     s->subcluster_bits = ctz32(s->subcluster_size);
   1456 
   1457     if (s->subcluster_size < (1 << MIN_CLUSTER_BITS)) {
   1458         error_setg(errp, "Unsupported subcluster size: %d", s->subcluster_size);
   1459         ret = -EINVAL;
   1460         goto fail;
   1461     }
   1462 
   1463     /* Check support for various header values */
   1464     if (header.refcount_order > 6) {
   1465         error_setg(errp, "Reference count entry width too large; may not "
   1466                    "exceed 64 bits");
   1467         ret = -EINVAL;
   1468         goto fail;
   1469     }
   1470     s->refcount_order = header.refcount_order;
   1471     s->refcount_bits = 1 << s->refcount_order;
   1472     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
   1473     s->refcount_max += s->refcount_max - 1;
   1474 
   1475     s->crypt_method_header = header.crypt_method;
   1476     if (s->crypt_method_header) {
   1477         if (bdrv_uses_whitelist() &&
   1478             s->crypt_method_header == QCOW_CRYPT_AES) {
   1479             error_setg(errp,
   1480                        "Use of AES-CBC encrypted qcow2 images is no longer "
   1481                        "supported in system emulators");
   1482             error_append_hint(errp,
   1483                               "You can use 'qemu-img convert' to convert your "
   1484                               "image to an alternative supported format, such "
   1485                               "as unencrypted qcow2, or raw with the LUKS "
   1486                               "format instead.\n");
   1487             ret = -ENOSYS;
   1488             goto fail;
   1489         }
   1490 
   1491         if (s->crypt_method_header == QCOW_CRYPT_AES) {
   1492             s->crypt_physical_offset = false;
   1493         } else {
   1494             /* Assuming LUKS and any future crypt methods we
   1495              * add will all use physical offsets, due to the
   1496              * fact that the alternative is insecure...  */
   1497             s->crypt_physical_offset = true;
   1498         }
   1499 
   1500         bs->encrypted = true;
   1501     }
   1502 
   1503     s->l2_bits = s->cluster_bits - ctz32(l2_entry_size(s));
   1504     s->l2_size = 1 << s->l2_bits;
   1505     /* 2^(s->refcount_order - 3) is the refcount width in bytes */
   1506     s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
   1507     s->refcount_block_size = 1 << s->refcount_block_bits;
   1508     bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
   1509     s->csize_shift = (62 - (s->cluster_bits - 8));
   1510     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
   1511     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
   1512 
   1513     s->refcount_table_offset = header.refcount_table_offset;
   1514     s->refcount_table_size =
   1515         header.refcount_table_clusters << (s->cluster_bits - 3);
   1516 
   1517     if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
   1518         error_setg(errp, "Image does not contain a reference count table");
   1519         ret = -EINVAL;
   1520         goto fail;
   1521     }
   1522 
   1523     ret = qcow2_validate_table(bs, s->refcount_table_offset,
   1524                                header.refcount_table_clusters,
   1525                                s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
   1526                                "Reference count table", errp);
   1527     if (ret < 0) {
   1528         goto fail;
   1529     }
   1530 
   1531     if (!(flags & BDRV_O_CHECK)) {
   1532         /*
   1533          * The total size in bytes of the snapshot table is checked in
   1534          * qcow2_read_snapshots() because the size of each snapshot is
   1535          * variable and we don't know it yet.
   1536          * Here we only check the offset and number of snapshots.
   1537          */
   1538         ret = qcow2_validate_table(bs, header.snapshots_offset,
   1539                                    header.nb_snapshots,
   1540                                    sizeof(QCowSnapshotHeader),
   1541                                    sizeof(QCowSnapshotHeader) *
   1542                                        QCOW_MAX_SNAPSHOTS,
   1543                                    "Snapshot table", errp);
   1544         if (ret < 0) {
   1545             goto fail;
   1546         }
   1547     }
   1548 
   1549     /* read the level 1 table */
   1550     ret = qcow2_validate_table(bs, header.l1_table_offset,
   1551                                header.l1_size, L1E_SIZE,
   1552                                QCOW_MAX_L1_SIZE, "Active L1 table", errp);
   1553     if (ret < 0) {
   1554         goto fail;
   1555     }
   1556     s->l1_size = header.l1_size;
   1557     s->l1_table_offset = header.l1_table_offset;
   1558 
   1559     l1_vm_state_index = size_to_l1(s, header.size);
   1560     if (l1_vm_state_index > INT_MAX) {
   1561         error_setg(errp, "Image is too big");
   1562         ret = -EFBIG;
   1563         goto fail;
   1564     }
   1565     s->l1_vm_state_index = l1_vm_state_index;
   1566 
   1567     /* the L1 table must contain at least enough entries to put
   1568        header.size bytes */
   1569     if (s->l1_size < s->l1_vm_state_index) {
   1570         error_setg(errp, "L1 table is too small");
   1571         ret = -EINVAL;
   1572         goto fail;
   1573     }
   1574 
   1575     if (s->l1_size > 0) {
   1576         s->l1_table = qemu_try_blockalign(bs->file->bs, s->l1_size * L1E_SIZE);
   1577         if (s->l1_table == NULL) {
   1578             error_setg(errp, "Could not allocate L1 table");
   1579             ret = -ENOMEM;
   1580             goto fail;
   1581         }
   1582         ret = bdrv_co_pread(bs->file, s->l1_table_offset, s->l1_size * L1E_SIZE,
   1583                             s->l1_table, 0);
   1584         if (ret < 0) {
   1585             error_setg_errno(errp, -ret, "Could not read L1 table");
   1586             goto fail;
   1587         }
   1588         for(i = 0;i < s->l1_size; i++) {
   1589             s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
   1590         }
   1591     }
   1592 
   1593     /* Parse driver-specific options */
   1594     ret = qcow2_update_options(bs, options, flags, errp);
   1595     if (ret < 0) {
   1596         goto fail;
   1597     }
   1598 
   1599     s->flags = flags;
   1600 
   1601     ret = qcow2_refcount_init(bs);
   1602     if (ret != 0) {
   1603         error_setg_errno(errp, -ret, "Could not initialize refcount handling");
   1604         goto fail;
   1605     }
   1606 
   1607     QLIST_INIT(&s->cluster_allocs);
   1608     QTAILQ_INIT(&s->discards);
   1609 
   1610     /* read qcow2 extensions */
   1611     if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
   1612                               flags, &update_header, errp)) {
   1613         ret = -EINVAL;
   1614         goto fail;
   1615     }
   1616 
   1617     if (open_data_file) {
   1618         /* Open external data file */
   1619         s->data_file = bdrv_open_child(NULL, options, "data-file", bs,
   1620                                        &child_of_bds, BDRV_CHILD_DATA,
   1621                                        true, errp);
   1622         if (*errp) {
   1623             ret = -EINVAL;
   1624             goto fail;
   1625         }
   1626 
   1627         if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
   1628             if (!s->data_file && s->image_data_file) {
   1629                 s->data_file = bdrv_open_child(s->image_data_file, options,
   1630                                                "data-file", bs, &child_of_bds,
   1631                                                BDRV_CHILD_DATA, false, errp);
   1632                 if (!s->data_file) {
   1633                     ret = -EINVAL;
   1634                     goto fail;
   1635                 }
   1636             }
   1637             if (!s->data_file) {
   1638                 error_setg(errp, "'data-file' is required for this image");
   1639                 ret = -EINVAL;
   1640                 goto fail;
   1641             }
   1642 
   1643             /* No data here */
   1644             bs->file->role &= ~BDRV_CHILD_DATA;
   1645 
   1646             /* Must succeed because we have given up permissions if anything */
   1647             bdrv_child_refresh_perms(bs, bs->file, &error_abort);
   1648         } else {
   1649             if (s->data_file) {
   1650                 error_setg(errp, "'data-file' can only be set for images with "
   1651                                  "an external data file");
   1652                 ret = -EINVAL;
   1653                 goto fail;
   1654             }
   1655 
   1656             s->data_file = bs->file;
   1657 
   1658             if (data_file_is_raw(bs)) {
   1659                 error_setg(errp, "data-file-raw requires a data file");
   1660                 ret = -EINVAL;
   1661                 goto fail;
   1662             }
   1663         }
   1664     }
   1665 
   1666     /* qcow2_read_extension may have set up the crypto context
   1667      * if the crypt method needs a header region, some methods
   1668      * don't need header extensions, so must check here
   1669      */
   1670     if (s->crypt_method_header && !s->crypto) {
   1671         if (s->crypt_method_header == QCOW_CRYPT_AES) {
   1672             unsigned int cflags = 0;
   1673             if (flags & BDRV_O_NO_IO) {
   1674                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
   1675             }
   1676             s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
   1677                                            NULL, NULL, cflags,
   1678                                            QCOW2_MAX_THREADS, errp);
   1679             if (!s->crypto) {
   1680                 ret = -EINVAL;
   1681                 goto fail;
   1682             }
   1683         } else if (!(flags & BDRV_O_NO_IO)) {
   1684             error_setg(errp, "Missing CRYPTO header for crypt method %d",
   1685                        s->crypt_method_header);
   1686             ret = -EINVAL;
   1687             goto fail;
   1688         }
   1689     }
   1690 
   1691     /* read the backing file name */
   1692     if (header.backing_file_offset != 0) {
   1693         len = header.backing_file_size;
   1694         if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
   1695             len >= sizeof(bs->backing_file)) {
   1696             error_setg(errp, "Backing file name too long");
   1697             ret = -EINVAL;
   1698             goto fail;
   1699         }
   1700 
   1701         s->image_backing_file = g_malloc(len + 1);
   1702         ret = bdrv_co_pread(bs->file, header.backing_file_offset, len,
   1703                             s->image_backing_file, 0);
   1704         if (ret < 0) {
   1705             error_setg_errno(errp, -ret, "Could not read backing file name");
   1706             goto fail;
   1707         }
   1708         s->image_backing_file[len] = '\0';
   1709 
   1710         /*
   1711          * Update only when something has changed.  This function is called by
   1712          * qcow2_co_invalidate_cache(), and we do not want to reset
   1713          * auto_backing_file unless necessary.
   1714          */
   1715         if (!g_str_equal(s->image_backing_file, bs->backing_file)) {
   1716             pstrcpy(bs->backing_file, sizeof(bs->backing_file),
   1717                     s->image_backing_file);
   1718             pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   1719                     s->image_backing_file);
   1720         }
   1721     }
   1722 
   1723     /*
   1724      * Internal snapshots; skip reading them in check mode, because
   1725      * we do not need them then, and we do not want to abort because
   1726      * of a broken table.
   1727      */
   1728     if (!(flags & BDRV_O_CHECK)) {
   1729         s->snapshots_offset = header.snapshots_offset;
   1730         s->nb_snapshots = header.nb_snapshots;
   1731 
   1732         ret = qcow2_read_snapshots(bs, errp);
   1733         if (ret < 0) {
   1734             goto fail;
   1735         }
   1736     }
   1737 
   1738     /* Clear unknown autoclear feature bits */
   1739     update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
   1740     update_header = update_header && bdrv_is_writable(bs);
   1741     if (update_header) {
   1742         s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
   1743     }
   1744 
   1745     /* == Handle persistent dirty bitmaps ==
   1746      *
   1747      * We want load dirty bitmaps in three cases:
   1748      *
   1749      * 1. Normal open of the disk in active mode, not related to invalidation
   1750      *    after migration.
   1751      *
   1752      * 2. Invalidation of the target vm after pre-copy phase of migration, if
   1753      *    bitmaps are _not_ migrating through migration channel, i.e.
   1754      *    'dirty-bitmaps' capability is disabled.
   1755      *
   1756      * 3. Invalidation of source vm after failed or canceled migration.
   1757      *    This is a very interesting case. There are two possible types of
   1758      *    bitmaps:
   1759      *
   1760      *    A. Stored on inactivation and removed. They should be loaded from the
   1761      *       image.
   1762      *
   1763      *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
   1764      *       the migration channel (with dirty-bitmaps capability).
   1765      *
   1766      *    On the other hand, there are two possible sub-cases:
   1767      *
   1768      *    3.1 disk was changed by somebody else while were inactive. In this
   1769      *        case all in-RAM dirty bitmaps (both persistent and not) are
   1770      *        definitely invalid. And we don't have any method to determine
   1771      *        this.
   1772      *
   1773      *        Simple and safe thing is to just drop all the bitmaps of type B on
   1774      *        inactivation. But in this case we lose bitmaps in valid 4.2 case.
   1775      *
   1776      *        On the other hand, resuming source vm, if disk was already changed
   1777      *        is a bad thing anyway: not only bitmaps, the whole vm state is
   1778      *        out of sync with disk.
   1779      *
   1780      *        This means, that user or management tool, who for some reason
   1781      *        decided to resume source vm, after disk was already changed by
   1782      *        target vm, should at least drop all dirty bitmaps by hand.
   1783      *
   1784      *        So, we can ignore this case for now, but TODO: "generation"
   1785      *        extension for qcow2, to determine, that image was changed after
   1786      *        last inactivation. And if it is changed, we will drop (or at least
   1787      *        mark as 'invalid' all the bitmaps of type B, both persistent
   1788      *        and not).
   1789      *
   1790      *    3.2 disk was _not_ changed while were inactive. Bitmaps may be saved
   1791      *        to disk ('dirty-bitmaps' capability disabled), or not saved
   1792      *        ('dirty-bitmaps' capability enabled), but we don't need to care
   1793      *        of: let's load bitmaps as always: stored bitmaps will be loaded,
   1794      *        and not stored has flag IN_USE=1 in the image and will be skipped
   1795      *        on loading.
   1796      *
   1797      * One remaining possible case when we don't want load bitmaps:
   1798      *
   1799      * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
   1800      *    will be loaded on invalidation, no needs try loading them before)
   1801      */
   1802 
   1803     if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
   1804         /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
   1805         bool header_updated;
   1806         if (!qcow2_load_dirty_bitmaps(bs, &header_updated, errp)) {
   1807             ret = -EINVAL;
   1808             goto fail;
   1809         }
   1810 
   1811         update_header = update_header && !header_updated;
   1812     }
   1813 
   1814     if (update_header) {
   1815         ret = qcow2_update_header(bs);
   1816         if (ret < 0) {
   1817             error_setg_errno(errp, -ret, "Could not update qcow2 header");
   1818             goto fail;
   1819         }
   1820     }
   1821 
   1822     bs->supported_zero_flags = header.version >= 3 ?
   1823                                BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
   1824     bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
   1825 
   1826     /* Repair image if dirty */
   1827     if (!(flags & BDRV_O_CHECK) && bdrv_is_writable(bs) &&
   1828         (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
   1829         BdrvCheckResult result = {0};
   1830 
   1831         ret = qcow2_co_check_locked(bs, &result,
   1832                                     BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
   1833         if (ret < 0 || result.check_errors) {
   1834             if (ret >= 0) {
   1835                 ret = -EIO;
   1836             }
   1837             error_setg_errno(errp, -ret, "Could not repair dirty image");
   1838             goto fail;
   1839         }
   1840     }
   1841 
   1842 #ifdef DEBUG_ALLOC
   1843     {
   1844         BdrvCheckResult result = {0};
   1845         qcow2_check_refcounts(bs, &result, 0);
   1846     }
   1847 #endif
   1848 
   1849     qemu_co_queue_init(&s->thread_task_queue);
   1850 
   1851     return ret;
   1852 
   1853  fail:
   1854     g_free(s->image_data_file);
   1855     if (open_data_file && has_data_file(bs)) {
   1856         bdrv_unref_child(bs, s->data_file);
   1857         s->data_file = NULL;
   1858     }
   1859     g_free(s->unknown_header_fields);
   1860     cleanup_unknown_header_ext(bs);
   1861     qcow2_free_snapshots(bs);
   1862     qcow2_refcount_close(bs);
   1863     qemu_vfree(s->l1_table);
   1864     /* else pre-write overlap checks in cache_destroy may crash */
   1865     s->l1_table = NULL;
   1866     cache_clean_timer_del(bs);
   1867     if (s->l2_table_cache) {
   1868         qcow2_cache_destroy(s->l2_table_cache);
   1869     }
   1870     if (s->refcount_block_cache) {
   1871         qcow2_cache_destroy(s->refcount_block_cache);
   1872     }
   1873     qcrypto_block_free(s->crypto);
   1874     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
   1875     return ret;
   1876 }
   1877 
/*
 * Argument/result bundle passed (as the opaque pointer) to
 * qcow2_open_entry(), which forwards the fields to qcow2_do_open() and
 * stores the outcome in @ret.
 */
typedef struct QCow2OpenCo {
    BlockDriverState *bs;   /* node being opened */
    QDict *options;         /* options forwarded to qcow2_do_open() */
    int flags;              /* open flags forwarded to qcow2_do_open() */
    Error **errp;           /* error destination forwarded to qcow2_do_open() */
    int ret;                /* result; qcow2_open() seeds it with -EINPROGRESS */
} QCow2OpenCo;
   1885 
   1886 static void coroutine_fn qcow2_open_entry(void *opaque)
   1887 {
   1888     QCow2OpenCo *qoc = opaque;
   1889     BDRVQcow2State *s = qoc->bs->opaque;
   1890 
   1891     qemu_co_mutex_lock(&s->lock);
   1892     qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
   1893                              qoc->errp);
   1894     qemu_co_mutex_unlock(&s->lock);
   1895 }
   1896 
   1897 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
   1898                       Error **errp)
   1899 {
   1900     BDRVQcow2State *s = bs->opaque;
   1901     QCow2OpenCo qoc = {
   1902         .bs = bs,
   1903         .options = options,
   1904         .flags = flags,
   1905         .errp = errp,
   1906         .ret = -EINPROGRESS
   1907     };
   1908     int ret;
   1909 
   1910     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
   1911     if (ret < 0) {
   1912         return ret;
   1913     }
   1914 
   1915     /* Initialise locks */
   1916     qemu_co_mutex_init(&s->lock);
   1917 
   1918     if (qemu_in_coroutine()) {
   1919         /* From bdrv_co_create.  */
   1920         qcow2_open_entry(&qoc);
   1921     } else {
   1922         assert(qemu_get_current_aio_context() == qemu_get_aio_context());
   1923         qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
   1924         BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
   1925     }
   1926     return qoc.ret;
   1927 }
   1928 
   1929 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
   1930 {
   1931     BDRVQcow2State *s = bs->opaque;
   1932 
   1933     if (bs->encrypted) {
   1934         /* Encryption works on a sector granularity */
   1935         bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
   1936     }
   1937     bs->bl.pwrite_zeroes_alignment = s->subcluster_size;
   1938     bs->bl.pdiscard_alignment = s->cluster_size;
   1939 }
   1940 
   1941 static int qcow2_reopen_prepare(BDRVReopenState *state,
   1942                                 BlockReopenQueue *queue, Error **errp)
   1943 {
   1944     BDRVQcow2State *s = state->bs->opaque;
   1945     Qcow2ReopenState *r;
   1946     int ret;
   1947 
   1948     r = g_new0(Qcow2ReopenState, 1);
   1949     state->opaque = r;
   1950 
   1951     ret = qcow2_update_options_prepare(state->bs, r, state->options,
   1952                                        state->flags, errp);
   1953     if (ret < 0) {
   1954         goto fail;
   1955     }
   1956 
   1957     /* We need to write out any unwritten data if we reopen read-only. */
   1958     if ((state->flags & BDRV_O_RDWR) == 0) {
   1959         ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
   1960         if (ret < 0) {
   1961             goto fail;
   1962         }
   1963 
   1964         ret = bdrv_flush(state->bs);
   1965         if (ret < 0) {
   1966             goto fail;
   1967         }
   1968 
   1969         ret = qcow2_mark_clean(state->bs);
   1970         if (ret < 0) {
   1971             goto fail;
   1972         }
   1973     }
   1974 
   1975     /*
   1976      * Without an external data file, s->data_file points to the same BdrvChild
   1977      * as bs->file. It needs to be resynced after reopen because bs->file may
   1978      * be changed. We can't use it in the meantime.
   1979      */
   1980     if (!has_data_file(state->bs)) {
   1981         assert(s->data_file == state->bs->file);
   1982         s->data_file = NULL;
   1983     }
   1984 
   1985     return 0;
   1986 
   1987 fail:
   1988     qcow2_update_options_abort(state->bs, r);
   1989     g_free(r);
   1990     return ret;
   1991 }
   1992 
   1993 static void qcow2_reopen_commit(BDRVReopenState *state)
   1994 {
   1995     BDRVQcow2State *s = state->bs->opaque;
   1996 
   1997     qcow2_update_options_commit(state->bs, state->opaque);
   1998     if (!s->data_file) {
   1999         /*
   2000          * If we don't have an external data file, s->data_file was cleared by
   2001          * qcow2_reopen_prepare() and needs to be updated.
   2002          */
   2003         s->data_file = state->bs->file;
   2004     }
   2005     g_free(state->opaque);
   2006 }
   2007 
   2008 static void qcow2_reopen_commit_post(BDRVReopenState *state)
   2009 {
   2010     if (state->flags & BDRV_O_RDWR) {
   2011         Error *local_err = NULL;
   2012 
   2013         if (qcow2_reopen_bitmaps_rw(state->bs, &local_err) < 0) {
   2014             /*
   2015              * This is not fatal, bitmaps just left read-only, so all following
   2016              * writes will fail. User can remove read-only bitmaps to unblock
   2017              * writes or retry reopen.
   2018              */
   2019             error_reportf_err(local_err,
   2020                               "%s: Failed to make dirty bitmaps writable: ",
   2021                               bdrv_get_node_name(state->bs));
   2022         }
   2023     }
   2024 }
   2025 
   2026 static void qcow2_reopen_abort(BDRVReopenState *state)
   2027 {
   2028     BDRVQcow2State *s = state->bs->opaque;
   2029 
   2030     if (!s->data_file) {
   2031         /*
   2032          * If we don't have an external data file, s->data_file was cleared by
   2033          * qcow2_reopen_prepare() and needs to be restored.
   2034          */
   2035         s->data_file = state->bs->file;
   2036     }
   2037     qcow2_update_options_abort(state->bs, state->opaque);
   2038     g_free(state->opaque);
   2039 }
   2040 
   2041 static void qcow2_join_options(QDict *options, QDict *old_options)
   2042 {
   2043     bool has_new_overlap_template =
   2044         qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
   2045         qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
   2046     bool has_new_total_cache_size =
   2047         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
   2048     bool has_all_cache_options;
   2049 
   2050     /* New overlap template overrides all old overlap options */
   2051     if (has_new_overlap_template) {
   2052         qdict_del(old_options, QCOW2_OPT_OVERLAP);
   2053         qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
   2054         qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
   2055         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
   2056         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
   2057         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
   2058         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
   2059         qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
   2060         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
   2061         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
   2062     }
   2063 
   2064     /* New total cache size overrides all old options */
   2065     if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) {
   2066         qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
   2067         qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
   2068     }
   2069 
   2070     qdict_join(options, old_options, false);
   2071 
   2072     /*
   2073      * If after merging all cache size options are set, an old total size is
   2074      * overwritten. Do keep all options, however, if all three are new. The
   2075      * resulting error message is what we want to happen.
   2076      */
   2077     has_all_cache_options =
   2078         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) ||
   2079         qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) ||
   2080         qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
   2081 
   2082     if (has_all_cache_options && !has_new_total_cache_size) {
   2083         qdict_del(options, QCOW2_OPT_CACHE_SIZE);
   2084     }
   2085 }
   2086 
   2087 static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
   2088                                               bool want_zero,
   2089                                               int64_t offset, int64_t count,
   2090                                               int64_t *pnum, int64_t *map,
   2091                                               BlockDriverState **file)
   2092 {
   2093     BDRVQcow2State *s = bs->opaque;
   2094     uint64_t host_offset;
   2095     unsigned int bytes;
   2096     QCow2SubclusterType type;
   2097     int ret, status = 0;
   2098 
   2099     qemu_co_mutex_lock(&s->lock);
   2100 
   2101     if (!s->metadata_preallocation_checked) {
   2102         ret = qcow2_detect_metadata_preallocation(bs);
   2103         s->metadata_preallocation = (ret == 1);
   2104         s->metadata_preallocation_checked = true;
   2105     }
   2106 
   2107     bytes = MIN(INT_MAX, count);
   2108     ret = qcow2_get_host_offset(bs, offset, &bytes, &host_offset, &type);
   2109     qemu_co_mutex_unlock(&s->lock);
   2110     if (ret < 0) {
   2111         return ret;
   2112     }
   2113 
   2114     *pnum = bytes;
   2115 
   2116     if ((type == QCOW2_SUBCLUSTER_NORMAL ||
   2117          type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
   2118          type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) && !s->crypto) {
   2119         *map = host_offset;
   2120         *file = s->data_file->bs;
   2121         status |= BDRV_BLOCK_OFFSET_VALID;
   2122     }
   2123     if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
   2124         type == QCOW2_SUBCLUSTER_ZERO_ALLOC) {
   2125         status |= BDRV_BLOCK_ZERO;
   2126     } else if (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
   2127                type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) {
   2128         status |= BDRV_BLOCK_DATA;
   2129     }
   2130     if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) &&
   2131         (status & BDRV_BLOCK_OFFSET_VALID))
   2132     {
   2133         status |= BDRV_BLOCK_RECURSE;
   2134     }
   2135     return status;
   2136 }
   2137 
   2138 static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
   2139                                             QCowL2Meta **pl2meta,
   2140                                             bool link_l2)
   2141 {
   2142     int ret = 0;
   2143     QCowL2Meta *l2meta = *pl2meta;
   2144 
   2145     while (l2meta != NULL) {
   2146         QCowL2Meta *next;
   2147 
   2148         if (link_l2) {
   2149             ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
   2150             if (ret) {
   2151                 goto out;
   2152             }
   2153         } else {
   2154             qcow2_alloc_cluster_abort(bs, l2meta);
   2155         }
   2156 
   2157         /* Take the request off the list of running requests */
   2158         QLIST_REMOVE(l2meta, next_in_flight);
   2159 
   2160         qemu_co_queue_restart_all(&l2meta->dependent_requests);
   2161 
   2162         next = l2meta->next;
   2163         g_free(l2meta);
   2164         l2meta = next;
   2165     }
   2166 out:
   2167     *pl2meta = l2meta;
   2168     return ret;
   2169 }
   2170 
   2171 static coroutine_fn int
   2172 qcow2_co_preadv_encrypted(BlockDriverState *bs,
   2173                            uint64_t host_offset,
   2174                            uint64_t offset,
   2175                            uint64_t bytes,
   2176                            QEMUIOVector *qiov,
   2177                            uint64_t qiov_offset)
   2178 {
   2179     int ret;
   2180     BDRVQcow2State *s = bs->opaque;
   2181     uint8_t *buf;
   2182 
   2183     assert(bs->encrypted && s->crypto);
   2184     assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
   2185 
   2186     /*
   2187      * For encrypted images, read everything into a temporary
   2188      * contiguous buffer on which the AES functions can work.
   2189      * Also, decryption in a separate buffer is better as it
   2190      * prevents the guest from learning information about the
   2191      * encrypted nature of the virtual disk.
   2192      */
   2193 
   2194     buf = qemu_try_blockalign(s->data_file->bs, bytes);
   2195     if (buf == NULL) {
   2196         return -ENOMEM;
   2197     }
   2198 
   2199     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
   2200     ret = bdrv_co_pread(s->data_file, host_offset, bytes, buf, 0);
   2201     if (ret < 0) {
   2202         goto fail;
   2203     }
   2204 
   2205     if (qcow2_co_decrypt(bs, host_offset, offset, buf, bytes) < 0)
   2206     {
   2207         ret = -EIO;
   2208         goto fail;
   2209     }
   2210     qemu_iovec_from_buf(qiov, qiov_offset, buf, bytes);
   2211 
   2212 fail:
   2213     qemu_vfree(buf);
   2214 
   2215     return ret;
   2216 }
   2217 
/*
 * Description of one qcow2 read or write sub-request, as filled in by
 * qcow2_add_task().  The embedded AioTask carries the function to run.
 */
typedef struct Qcow2AioTask {
    AioTask task;

    BlockDriverState *bs;
    QCow2SubclusterType subcluster_type; /* only for read */
    uint64_t host_offset; /* or l2_entry for compressed read */
    uint64_t offset;
    uint64_t bytes;
    QEMUIOVector *qiov;
    uint64_t qiov_offset; /* offset into qiov */
    QCowL2Meta *l2meta; /* only for write */
} Qcow2AioTask;
   2230 
   2231 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task);
   2232 static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
   2233                                        AioTaskPool *pool,
   2234                                        AioTaskFunc func,
   2235                                        QCow2SubclusterType subcluster_type,
   2236                                        uint64_t host_offset,
   2237                                        uint64_t offset,
   2238                                        uint64_t bytes,
   2239                                        QEMUIOVector *qiov,
   2240                                        size_t qiov_offset,
   2241                                        QCowL2Meta *l2meta)
   2242 {
   2243     Qcow2AioTask local_task;
   2244     Qcow2AioTask *task = pool ? g_new(Qcow2AioTask, 1) : &local_task;
   2245 
   2246     *task = (Qcow2AioTask) {
   2247         .task.func = func,
   2248         .bs = bs,
   2249         .subcluster_type = subcluster_type,
   2250         .qiov = qiov,
   2251         .host_offset = host_offset,
   2252         .offset = offset,
   2253         .bytes = bytes,
   2254         .qiov_offset = qiov_offset,
   2255         .l2meta = l2meta,
   2256     };
   2257 
   2258     trace_qcow2_add_task(qemu_coroutine_self(), bs, pool,
   2259                          func == qcow2_co_preadv_task_entry ? "read" : "write",
   2260                          subcluster_type, host_offset, offset, bytes,
   2261                          qiov, qiov_offset);
   2262 
   2263     if (!pool) {
   2264         return func(&task->task);
   2265     }
   2266 
   2267     aio_task_pool_start_task(pool, &task->task);
   2268 
   2269     return 0;
   2270 }
   2271 
   2272 static coroutine_fn int qcow2_co_preadv_task(BlockDriverState *bs,
   2273                                              QCow2SubclusterType subc_type,
   2274                                              uint64_t host_offset,
   2275                                              uint64_t offset, uint64_t bytes,
   2276                                              QEMUIOVector *qiov,
   2277                                              size_t qiov_offset)
   2278 {
   2279     BDRVQcow2State *s = bs->opaque;
   2280 
   2281     switch (subc_type) {
   2282     case QCOW2_SUBCLUSTER_ZERO_PLAIN:
   2283     case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   2284         /* Both zero types are handled in qcow2_co_preadv_part */
   2285         g_assert_not_reached();
   2286 
   2287     case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
   2288     case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   2289         assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */
   2290 
   2291         BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
   2292         return bdrv_co_preadv_part(bs->backing, offset, bytes,
   2293                                    qiov, qiov_offset, 0);
   2294 
   2295     case QCOW2_SUBCLUSTER_COMPRESSED:
   2296         return qcow2_co_preadv_compressed(bs, host_offset,
   2297                                           offset, bytes, qiov, qiov_offset);
   2298 
   2299     case QCOW2_SUBCLUSTER_NORMAL:
   2300         if (bs->encrypted) {
   2301             return qcow2_co_preadv_encrypted(bs, host_offset,
   2302                                              offset, bytes, qiov, qiov_offset);
   2303         }
   2304 
   2305         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
   2306         return bdrv_co_preadv_part(s->data_file, host_offset,
   2307                                    bytes, qiov, qiov_offset, 0);
   2308 
   2309     default:
   2310         g_assert_not_reached();
   2311     }
   2312 
   2313     g_assert_not_reached();
   2314 }
   2315 
   2316 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task)
   2317 {
   2318     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
   2319 
   2320     assert(!t->l2meta);
   2321 
   2322     return qcow2_co_preadv_task(t->bs, t->subcluster_type,
   2323                                 t->host_offset, t->offset, t->bytes,
   2324                                 t->qiov, t->qiov_offset);
   2325 }
   2326 
   2327 static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
   2328                                              int64_t offset, int64_t bytes,
   2329                                              QEMUIOVector *qiov,
   2330                                              size_t qiov_offset,
   2331                                              BdrvRequestFlags flags)
   2332 {
   2333     BDRVQcow2State *s = bs->opaque;
   2334     int ret = 0;
   2335     unsigned int cur_bytes; /* number of bytes in current iteration */
   2336     uint64_t host_offset = 0;
   2337     QCow2SubclusterType type;
   2338     AioTaskPool *aio = NULL;
   2339 
   2340     while (bytes != 0 && aio_task_pool_status(aio) == 0) {
   2341         /* prepare next request */
   2342         cur_bytes = MIN(bytes, INT_MAX);
   2343         if (s->crypto) {
   2344             cur_bytes = MIN(cur_bytes,
   2345                             QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
   2346         }
   2347 
   2348         qemu_co_mutex_lock(&s->lock);
   2349         ret = qcow2_get_host_offset(bs, offset, &cur_bytes,
   2350                                     &host_offset, &type);
   2351         qemu_co_mutex_unlock(&s->lock);
   2352         if (ret < 0) {
   2353             goto out;
   2354         }
   2355 
   2356         if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
   2357             type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
   2358             (type == QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN && !bs->backing) ||
   2359             (type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC && !bs->backing))
   2360         {
   2361             qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
   2362         } else {
   2363             if (!aio && cur_bytes != bytes) {
   2364                 aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
   2365             }
   2366             ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
   2367                                  host_offset, offset, cur_bytes,
   2368                                  qiov, qiov_offset, NULL);
   2369             if (ret < 0) {
   2370                 goto out;
   2371             }
   2372         }
   2373 
   2374         bytes -= cur_bytes;
   2375         offset += cur_bytes;
   2376         qiov_offset += cur_bytes;
   2377     }
   2378 
   2379 out:
   2380     if (aio) {
   2381         aio_task_pool_wait_all(aio);
   2382         if (ret == 0) {
   2383             ret = aio_task_pool_status(aio);
   2384         }
   2385         g_free(aio);
   2386     }
   2387 
   2388     return ret;
   2389 }
   2390 
   2391 /* Check if it's possible to merge a write request with the writing of
   2392  * the data from the COW regions */
   2393 static bool merge_cow(uint64_t offset, unsigned bytes,
   2394                       QEMUIOVector *qiov, size_t qiov_offset,
   2395                       QCowL2Meta *l2meta)
   2396 {
   2397     QCowL2Meta *m;
   2398 
   2399     for (m = l2meta; m != NULL; m = m->next) {
   2400         /* If both COW regions are empty then there's nothing to merge */
   2401         if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
   2402             continue;
   2403         }
   2404 
   2405         /* If COW regions are handled already, skip this too */
   2406         if (m->skip_cow) {
   2407             continue;
   2408         }
   2409 
   2410         /*
   2411          * The write request should start immediately after the first
   2412          * COW region. This does not always happen because the area
   2413          * touched by the request can be larger than the one defined
   2414          * by @m (a single request can span an area consisting of a
   2415          * mix of previously unallocated and allocated clusters, that
   2416          * is why @l2meta is a list).
   2417          */
   2418         if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
   2419             /* In this case the request starts before this region */
   2420             assert(offset < l2meta_cow_start(m));
   2421             assert(m->cow_start.nb_bytes == 0);
   2422             continue;
   2423         }
   2424 
   2425         /* The write request should end immediately before the second
   2426          * COW region (see above for why it does not always happen) */
   2427         if (m->offset + m->cow_end.offset != offset + bytes) {
   2428             assert(offset + bytes > m->offset + m->cow_end.offset);
   2429             assert(m->cow_end.nb_bytes == 0);
   2430             continue;
   2431         }
   2432 
   2433         /* Make sure that adding both COW regions to the QEMUIOVector
   2434          * does not exceed IOV_MAX */
   2435         if (qemu_iovec_subvec_niov(qiov, qiov_offset, bytes) > IOV_MAX - 2) {
   2436             continue;
   2437         }
   2438 
   2439         m->data_qiov = qiov;
   2440         m->data_qiov_offset = qiov_offset;
   2441         return true;
   2442     }
   2443 
   2444     return false;
   2445 }
   2446 
   2447 /*
   2448  * Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
   2449  * Note that returning 0 does not guarantee non-zero data.
   2450  */
   2451 static int coroutine_fn is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
   2452 {
   2453     /*
   2454      * This check is designed for optimization shortcut so it must be
   2455      * efficient.
   2456      * Instead of is_zero(), use bdrv_co_is_zero_fast() as it is
   2457      * faster (but not as accurate and can result in false negatives).
   2458      */
   2459     int ret = bdrv_co_is_zero_fast(bs, m->offset + m->cow_start.offset,
   2460                                    m->cow_start.nb_bytes);
   2461     if (ret <= 0) {
   2462         return ret;
   2463     }
   2464 
   2465     return bdrv_co_is_zero_fast(bs, m->offset + m->cow_end.offset,
   2466                                 m->cow_end.nb_bytes);
   2467 }
   2468 
   2469 static int coroutine_fn handle_alloc_space(BlockDriverState *bs,
   2470                                            QCowL2Meta *l2meta)
   2471 {
   2472     BDRVQcow2State *s = bs->opaque;
   2473     QCowL2Meta *m;
   2474 
   2475     if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) {
   2476         return 0;
   2477     }
   2478 
   2479     if (bs->encrypted) {
   2480         return 0;
   2481     }
   2482 
   2483     for (m = l2meta; m != NULL; m = m->next) {
   2484         int ret;
   2485         uint64_t start_offset = m->alloc_offset + m->cow_start.offset;
   2486         unsigned nb_bytes = m->cow_end.offset + m->cow_end.nb_bytes -
   2487             m->cow_start.offset;
   2488 
   2489         if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
   2490             continue;
   2491         }
   2492 
   2493         ret = is_zero_cow(bs, m);
   2494         if (ret < 0) {
   2495             return ret;
   2496         } else if (ret == 0) {
   2497             continue;
   2498         }
   2499 
   2500         /*
   2501          * instead of writing zero COW buffers,
   2502          * efficiently zero out the whole clusters
   2503          */
   2504 
   2505         ret = qcow2_pre_write_overlap_check(bs, 0, start_offset, nb_bytes,
   2506                                             true);
   2507         if (ret < 0) {
   2508             return ret;
   2509         }
   2510 
   2511         BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
   2512         ret = bdrv_co_pwrite_zeroes(s->data_file, start_offset, nb_bytes,
   2513                                     BDRV_REQ_NO_FALLBACK);
   2514         if (ret < 0) {
   2515             if (ret != -ENOTSUP && ret != -EAGAIN) {
   2516                 return ret;
   2517             }
   2518             continue;
   2519         }
   2520 
   2521         trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
   2522         m->skip_cow = true;
   2523     }
   2524     return 0;
   2525 }
   2526 
   2527 /*
   2528  * qcow2_co_pwritev_task
   2529  * Called with s->lock unlocked
   2530  * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. Caller must
   2531  *           not use it somehow after qcow2_co_pwritev_task() call
   2532  */
   2533 static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
   2534                                               uint64_t host_offset,
   2535                                               uint64_t offset, uint64_t bytes,
   2536                                               QEMUIOVector *qiov,
   2537                                               uint64_t qiov_offset,
   2538                                               QCowL2Meta *l2meta)
   2539 {
   2540     int ret;
   2541     BDRVQcow2State *s = bs->opaque;
   2542     void *crypt_buf = NULL;
   2543     QEMUIOVector encrypted_qiov;
   2544 
   2545     if (bs->encrypted) {
   2546         assert(s->crypto);
   2547         assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
   2548         crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
   2549         if (crypt_buf == NULL) {
   2550             ret = -ENOMEM;
   2551             goto out_unlocked;
   2552         }
   2553         qemu_iovec_to_buf(qiov, qiov_offset, crypt_buf, bytes);
   2554 
   2555         if (qcow2_co_encrypt(bs, host_offset, offset, crypt_buf, bytes) < 0) {
   2556             ret = -EIO;
   2557             goto out_unlocked;
   2558         }
   2559 
   2560         qemu_iovec_init_buf(&encrypted_qiov, crypt_buf, bytes);
   2561         qiov = &encrypted_qiov;
   2562         qiov_offset = 0;
   2563     }
   2564 
   2565     /* Try to efficiently initialize the physical space with zeroes */
   2566     ret = handle_alloc_space(bs, l2meta);
   2567     if (ret < 0) {
   2568         goto out_unlocked;
   2569     }
   2570 
   2571     /*
   2572      * If we need to do COW, check if it's possible to merge the
   2573      * writing of the guest data together with that of the COW regions.
   2574      * If it's not possible (or not necessary) then write the
   2575      * guest data now.
   2576      */
   2577     if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
   2578         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
   2579         trace_qcow2_writev_data(qemu_coroutine_self(), host_offset);
   2580         ret = bdrv_co_pwritev_part(s->data_file, host_offset,
   2581                                    bytes, qiov, qiov_offset, 0);
   2582         if (ret < 0) {
   2583             goto out_unlocked;
   2584         }
   2585     }
   2586 
   2587     qemu_co_mutex_lock(&s->lock);
   2588 
   2589     ret = qcow2_handle_l2meta(bs, &l2meta, true);
   2590     goto out_locked;
   2591 
   2592 out_unlocked:
   2593     qemu_co_mutex_lock(&s->lock);
   2594 
   2595 out_locked:
   2596     qcow2_handle_l2meta(bs, &l2meta, false);
   2597     qemu_co_mutex_unlock(&s->lock);
   2598 
   2599     qemu_vfree(crypt_buf);
   2600 
   2601     return ret;
   2602 }
   2603 
   2604 static coroutine_fn int qcow2_co_pwritev_task_entry(AioTask *task)
   2605 {
   2606     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
   2607 
   2608     assert(!t->subcluster_type);
   2609 
   2610     return qcow2_co_pwritev_task(t->bs, t->host_offset,
   2611                                  t->offset, t->bytes, t->qiov, t->qiov_offset,
   2612                                  t->l2meta);
   2613 }
   2614 
/*
 * Write @bytes bytes from @qiov (starting at @qiov_offset) to guest offset
 * @offset.
 *
 * Each iteration allocates host space for as much of the remaining request
 * as qcow2_alloc_host_offset() grants, runs the pre-write overlap check on
 * it and then hands the chunk to qcow2_add_task().  A task pool is only
 * created once a request turns out to need more than one chunk.
 *
 * Returns 0 on success or a negative errno.  @flags is not used here.
 */
static coroutine_fn int qcow2_co_pwritev_part(
        BlockDriverState *bs, int64_t offset, int64_t bytes,
        QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t host_offset;
    QCowL2Meta *l2meta = NULL;
    AioTaskPool *aio = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    /* Stop early as soon as a task already in the pool has failed */
    while (bytes != 0 && aio_task_pool_status(aio) == 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            /* Keep the chunk within what the task's bounce buffer allows */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        qemu_co_mutex_lock(&s->lock);

        ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
                                      &host_offset, &l2meta);
        if (ret < 0) {
            goto out_locked;
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, host_offset,
                                            cur_bytes, true);
        if (ret < 0) {
            goto out_locked;
        }

        qemu_co_mutex_unlock(&s->lock);

        /* Only pay for a pool when the request needs several chunks */
        if (!aio && cur_bytes != bytes) {
            aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
        }
        ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0,
                             host_offset, offset,
                             cur_bytes, qiov, qiov_offset, l2meta);
        l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */
        if (ret < 0) {
            /* s->lock is not held and there is no l2meta left to handle */
            goto fail_nometa;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        qiov_offset += cur_bytes;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    /*
     * Also reached when the loop stopped because a pooled task failed; the
     * pool status replaces this 0 further down in that case.
     */
    ret = 0;

    qemu_co_mutex_lock(&s->lock);

out_locked:
    /* Entered with s->lock held; l2meta may still be pending here */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

fail_nometa:
    if (aio) {
        /* Drain outstanding tasks and pick up their status, if we have none */
        aio_task_pool_wait_all(aio);
        if (ret == 0) {
            ret = aio_task_pool_status(aio);
        }
        g_free(aio);
    }

    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}
   2696 
   2697 static int qcow2_inactivate(BlockDriverState *bs)
   2698 {
   2699     BDRVQcow2State *s = bs->opaque;
   2700     int ret, result = 0;
   2701     Error *local_err = NULL;
   2702 
   2703     qcow2_store_persistent_dirty_bitmaps(bs, true, &local_err);
   2704     if (local_err != NULL) {
   2705         result = -EINVAL;
   2706         error_reportf_err(local_err, "Lost persistent bitmaps during "
   2707                           "inactivation of node '%s': ",
   2708                           bdrv_get_device_or_node_name(bs));
   2709     }
   2710 
   2711     ret = qcow2_cache_flush(bs, s->l2_table_cache);
   2712     if (ret) {
   2713         result = ret;
   2714         error_report("Failed to flush the L2 table cache: %s",
   2715                      strerror(-ret));
   2716     }
   2717 
   2718     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
   2719     if (ret) {
   2720         result = ret;
   2721         error_report("Failed to flush the refcount block cache: %s",
   2722                      strerror(-ret));
   2723     }
   2724 
   2725     if (result == 0) {
   2726         qcow2_mark_clean(bs);
   2727     }
   2728 
   2729     return result;
   2730 }
   2731 
/*
 * Release the qcow2 driver state attached to @bs.
 *
 * If the image is not already inactive it is inactivated first (the result
 * of that is not checked here).  The external data file child is dropped
 * only when @close_data_file is set and has_data_file() reports one.
 */
static void qcow2_do_close(BlockDriverState *bs, bool close_data_file)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    cache_clean_timer_del(bs);
    qcow2_cache_destroy(s->l2_table_cache);
    qcow2_cache_destroy(s->refcount_block_cache);

    qcrypto_block_free(s->crypto);
    s->crypto = NULL;
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);

    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_data_file);
    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    /* Callers may ask to keep the data file child (see the parameter) */
    if (close_data_file && has_data_file(bs)) {
        bdrv_unref_child(bs, s->data_file);
        s->data_file = NULL;
    }

    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}
   2766 
/*
 * Close the image: full teardown through qcow2_do_close(), including the
 * external data file child.
 */
static void qcow2_close(BlockDriverState *bs)
{
    qcow2_do_close(bs, true);
}
   2771 
   2772 static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
   2773                                                    Error **errp)
   2774 {
   2775     ERRP_GUARD();
   2776     BDRVQcow2State *s = bs->opaque;
   2777     BdrvChild *data_file;
   2778     int flags = s->flags;
   2779     QCryptoBlock *crypto = NULL;
   2780     QDict *options;
   2781     int ret;
   2782 
   2783     /*
   2784      * Backing files are read-only which makes all of their metadata immutable,
   2785      * that means we don't have to worry about reopening them here.
   2786      */
   2787 
   2788     crypto = s->crypto;
   2789     s->crypto = NULL;
   2790 
   2791     /*
   2792      * Do not reopen s->data_file (i.e., have qcow2_do_close() not close it,
   2793      * and then prevent qcow2_do_open() from opening it), because this function
   2794      * runs in the I/O path and as such we must not invoke global-state
   2795      * functions like bdrv_unref_child() and bdrv_open_child().
   2796      */
   2797 
   2798     qcow2_do_close(bs, false);
   2799 
   2800     data_file = s->data_file;
   2801     memset(s, 0, sizeof(BDRVQcow2State));
   2802     s->data_file = data_file;
   2803 
   2804     options = qdict_clone_shallow(bs->options);
   2805 
   2806     flags &= ~BDRV_O_INACTIVE;
   2807     qemu_co_mutex_lock(&s->lock);
   2808     ret = qcow2_do_open(bs, options, flags, false, errp);
   2809     qemu_co_mutex_unlock(&s->lock);
   2810     qobject_unref(options);
   2811     if (ret < 0) {
   2812         error_prepend(errp, "Could not reopen qcow2 layer: ");
   2813         bs->drv = NULL;
   2814         return;
   2815     }
   2816 
   2817     s->crypto = crypto;
   2818 }
   2819 
   2820 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
   2821     size_t len, size_t buflen)
   2822 {
   2823     QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
   2824     size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
   2825 
   2826     if (buflen < ext_len) {
   2827         return -ENOSPC;
   2828     }
   2829 
   2830     *ext_backing_fmt = (QCowExtension) {
   2831         .magic  = cpu_to_be32(magic),
   2832         .len    = cpu_to_be32(len),
   2833     };
   2834 
   2835     if (len) {
   2836         memcpy(buf + sizeof(QCowExtension), s, len);
   2837     }
   2838 
   2839     return ext_len;
   2840 }
   2841 
/*
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
 *
 * The new header is assembled in a cluster-sized scratch buffer in this
 * order: fixed header, preserved unknown header fields, header extensions,
 * end-of-extensions marker, backing file name.  The whole cluster is then
 * written at offset 0 of bs->file.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_update_header(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
    int ret;
    uint64_t total_size;
    uint32_t refcount_table_clusters;
    size_t header_length;
    Qcow2UnknownHeaderExtension *uext;

    buf = qemu_blockalign(bs, buflen);

    /*
     * Header structure.  @header keeps pointing at the start of the
     * allocation, while @buf / @buflen act as a write cursor and the space
     * remaining behind it.
     */
    header = (QCowHeader*) buf;

    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
    }

    header_length = sizeof(*header) + s->unknown_header_fields_size;
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    /* NOTE(review): the shift presumably assumes 8-byte table entries */
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    ret = validate_compression_type(s, NULL);
    if (ret) {
        goto fail;
    }

    *header = (QCowHeader) {
        /* Version 2 fields */
        .magic                  = cpu_to_be32(QCOW_MAGIC),
        .version                = cpu_to_be32(s->qcow_version),
        /* The two backing file fields are filled in at the very end */
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(s->refcount_order),
        .header_length          = cpu_to_be32(header_length),
        .compression_type       = s->compression_type,
    };

    /*
     * For older versions, write a shorter header.  Here and below, @ret
     * temporarily holds a byte count by which the cursor is advanced.
     */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
        ret = -EINVAL;
        goto fail;
    }

    buf += ret;
    buflen -= ret;
    /*
     * Clear everything behind the fixed header; for version 2 this also
     * wipes the version 3 fields that were set above.
     */
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }

    /* Backing file format header extension */
    if (s->image_backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             s->image_backing_format,
                             strlen(s->image_backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* External data file header extension */
    if (has_data_file(bs) && s->image_data_file) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_DATA_FILE,
                             s->image_data_file, strlen(s->image_data_file),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
        /*
         * The in-memory copy is converted to big-endian in place for the
         * duration of the call and converted back right afterwards.
         */
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /*
     * Feature table.  A mere 8 feature names occupies 392 bytes, and
     * when coupled with the v3 minimum header of 104 bytes plus the
     * 8-byte end-of-extension marker, that would leave only 8 bytes
     * for a backing file name in an image with 512-byte clusters.
     * Thus, we choose to omit this header for cluster sizes 4k and
     * smaller.
     */
    if (s->qcow_version >= 3 && s->cluster_size > 4096) {
        static const Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DATA_FILE_BITNR,
                .name = "external data file",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_COMPRESSION_BITNR,
                .name = "compression type",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_EXTL2_BITNR,
                .name = "extended L2 entries",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
            {
                .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
                .bit  = QCOW2_AUTOCLEAR_BITMAPS_BITNR,
                .name = "bitmaps",
            },
            {
                .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
                .bit  = QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR,
                .name = "raw external data",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
    if (ret < 0) {
        goto fail;
    }

    buf += ret;
    buflen -= ret;

    /* Backing file name */
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, s->image_backing_file, buflen);

        /* The offset is relative to the start of the header */
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
    }

    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, s->cluster_size, header, 0);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* @buf has been advanced; @header is still the start of the allocation */
    qemu_vfree(header);
    return ret;
}
   3105 
   3106 static int qcow2_change_backing_file(BlockDriverState *bs,
   3107     const char *backing_file, const char *backing_fmt)
   3108 {
   3109     BDRVQcow2State *s = bs->opaque;
   3110 
   3111     /* Adding a backing file means that the external data file alone won't be
   3112      * enough to make sense of the content */
   3113     if (backing_file && data_file_is_raw(bs)) {
   3114         return -EINVAL;
   3115     }
   3116 
   3117     if (backing_file && strlen(backing_file) > 1023) {
   3118         return -EINVAL;
   3119     }
   3120 
   3121     pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
   3122             backing_file ?: "");
   3123     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
   3124     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
   3125 
   3126     g_free(s->image_backing_file);
   3127     g_free(s->image_backing_format);
   3128 
   3129     s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
   3130     s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
   3131 
   3132     return qcow2_update_header(bs);
   3133 }
   3134 
   3135 static int qcow2_set_up_encryption(BlockDriverState *bs,
   3136                                    QCryptoBlockCreateOptions *cryptoopts,
   3137                                    Error **errp)
   3138 {
   3139     BDRVQcow2State *s = bs->opaque;
   3140     QCryptoBlock *crypto = NULL;
   3141     int fmt, ret;
   3142 
   3143     switch (cryptoopts->format) {
   3144     case Q_CRYPTO_BLOCK_FORMAT_LUKS:
   3145         fmt = QCOW_CRYPT_LUKS;
   3146         break;
   3147     case Q_CRYPTO_BLOCK_FORMAT_QCOW:
   3148         fmt = QCOW_CRYPT_AES;
   3149         break;
   3150     default:
   3151         error_setg(errp, "Crypto format not supported in qcow2");
   3152         return -EINVAL;
   3153     }
   3154 
   3155     s->crypt_method_header = fmt;
   3156 
   3157     crypto = qcrypto_block_create(cryptoopts, "encrypt.",
   3158                                   qcow2_crypto_hdr_init_func,
   3159                                   qcow2_crypto_hdr_write_func,
   3160                                   bs, errp);
   3161     if (!crypto) {
   3162         return -EINVAL;
   3163     }
   3164 
   3165     ret = qcow2_update_header(bs);
   3166     if (ret < 0) {
   3167         error_setg_errno(errp, -ret, "Could not write encryption header");
   3168         goto out;
   3169     }
   3170 
   3171     ret = 0;
   3172  out:
   3173     qcrypto_block_free(crypto);
   3174     return ret;
   3175 }
   3176 
   3177 /**
   3178  * Preallocates metadata structures for data clusters between @offset (in the
   3179  * guest disk) and @new_length (which is thus generally the new guest disk
   3180  * size).
   3181  *
   3182  * Returns: 0 on success, -errno on failure.
   3183  */
   3184 static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
   3185                                        uint64_t new_length, PreallocMode mode,
   3186                                        Error **errp)
   3187 {
   3188     BDRVQcow2State *s = bs->opaque;
   3189     uint64_t bytes;
   3190     uint64_t host_offset = 0;
   3191     int64_t file_length;
   3192     unsigned int cur_bytes;
   3193     int ret;
   3194     QCowL2Meta *meta = NULL, *m;
   3195 
   3196     assert(offset <= new_length);
   3197     bytes = new_length - offset;
   3198 
   3199     while (bytes) {
   3200         cur_bytes = MIN(bytes, QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size));
   3201         ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
   3202                                       &host_offset, &meta);
   3203         if (ret < 0) {
   3204             error_setg_errno(errp, -ret, "Allocating clusters failed");
   3205             goto out;
   3206         }
   3207 
   3208         for (m = meta; m != NULL; m = m->next) {
   3209             m->prealloc = true;
   3210         }
   3211 
   3212         ret = qcow2_handle_l2meta(bs, &meta, true);
   3213         if (ret < 0) {
   3214             error_setg_errno(errp, -ret, "Mapping clusters failed");
   3215             goto out;
   3216         }
   3217 
   3218         /* TODO Preallocate data if requested */
   3219 
   3220         bytes -= cur_bytes;
   3221         offset += cur_bytes;
   3222     }
   3223 
   3224     /*
   3225      * It is expected that the image file is large enough to actually contain
   3226      * all of the allocated clusters (otherwise we get failing reads after
   3227      * EOF). Extend the image to the last allocated sector.
   3228      */
   3229     file_length = bdrv_getlength(s->data_file->bs);
   3230     if (file_length < 0) {
   3231         error_setg_errno(errp, -file_length, "Could not get file size");
   3232         ret = file_length;
   3233         goto out;
   3234     }
   3235 
   3236     if (host_offset + cur_bytes > file_length) {
   3237         if (mode == PREALLOC_MODE_METADATA) {
   3238             mode = PREALLOC_MODE_OFF;
   3239         }
   3240         ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
   3241                                mode, 0, errp);
   3242         if (ret < 0) {
   3243             goto out;
   3244         }
   3245     }
   3246 
   3247     ret = 0;
   3248 
   3249 out:
   3250     qcow2_handle_l2meta(bs, &meta, false);
   3251     return ret;
   3252 }
   3253 
   3254 /* qcow2_refcount_metadata_size:
   3255  * @clusters: number of clusters to refcount (including data and L1/L2 tables)
   3256  * @cluster_size: size of a cluster, in bytes
   3257  * @refcount_order: refcount bits power-of-2 exponent
   3258  * @generous_increase: allow for the refcount table to be 1.5x as large as it
   3259  *                     needs to be
   3260  *
   3261  * Returns: Number of bytes required for refcount blocks and table metadata.
   3262  */
   3263 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
   3264                                      int refcount_order, bool generous_increase,
   3265                                      uint64_t *refblock_count)
   3266 {
   3267     /*
   3268      * Every host cluster is reference-counted, including metadata (even
   3269      * refcount metadata is recursively included).
   3270      *
   3271      * An accurate formula for the size of refcount metadata size is difficult
   3272      * to derive.  An easier method of calculation is finding the fixed point
   3273      * where no further refcount blocks or table clusters are required to
   3274      * reference count every cluster.
   3275      */
   3276     int64_t blocks_per_table_cluster = cluster_size / REFTABLE_ENTRY_SIZE;
   3277     int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
   3278     int64_t table = 0;  /* number of refcount table clusters */
   3279     int64_t blocks = 0; /* number of refcount block clusters */
   3280     int64_t last;
   3281     int64_t n = 0;
   3282 
   3283     do {
   3284         last = n;
   3285         blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
   3286         table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
   3287         n = clusters + blocks + table;
   3288 
   3289         if (n == last && generous_increase) {
   3290             clusters += DIV_ROUND_UP(table, 2);
   3291             n = 0; /* force another loop */
   3292             generous_increase = false;
   3293         }
   3294     } while (n != last);
   3295 
   3296     if (refblock_count) {
   3297         *refblock_count = blocks;
   3298     }
   3299 
   3300     return (blocks + table) * cluster_size;
   3301 }
   3302 
   3303 /**
   3304  * qcow2_calc_prealloc_size:
   3305  * @total_size: virtual disk size in bytes
   3306  * @cluster_size: cluster size in bytes
   3307  * @refcount_order: refcount bits power-of-2 exponent
   3308  * @extended_l2: true if the image has extended L2 entries
   3309  *
   3310  * Returns: Total number of bytes required for the fully allocated image
   3311  * (including metadata).
   3312  */
   3313 static int64_t qcow2_calc_prealloc_size(int64_t total_size,
   3314                                         size_t cluster_size,
   3315                                         int refcount_order,
   3316                                         bool extended_l2)
   3317 {
   3318     int64_t meta_size = 0;
   3319     uint64_t nl1e, nl2e;
   3320     int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);
   3321     size_t l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
   3322 
   3323     /* header: 1 cluster */
   3324     meta_size += cluster_size;
   3325 
   3326     /* total size of L2 tables */
   3327     nl2e = aligned_total_size / cluster_size;
   3328     nl2e = ROUND_UP(nl2e, cluster_size / l2e_size);
   3329     meta_size += nl2e * l2e_size;
   3330 
   3331     /* total size of L1 tables */
   3332     nl1e = nl2e * l2e_size / cluster_size;
   3333     nl1e = ROUND_UP(nl1e, cluster_size / L1E_SIZE);
   3334     meta_size += nl1e * L1E_SIZE;
   3335 
   3336     /* total size of refcount table and blocks */
   3337     meta_size += qcow2_refcount_metadata_size(
   3338             (meta_size + aligned_total_size) / cluster_size,
   3339             cluster_size, refcount_order, false, NULL);
   3340 
   3341     return meta_size + aligned_total_size;
   3342 }
   3343 
   3344 static bool validate_cluster_size(size_t cluster_size, bool extended_l2,
   3345                                   Error **errp)
   3346 {
   3347     int cluster_bits = ctz32(cluster_size);
   3348     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
   3349         (1 << cluster_bits) != cluster_size)
   3350     {
   3351         error_setg(errp, "Cluster size must be a power of two between %d and "
   3352                    "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
   3353         return false;
   3354     }
   3355 
   3356     if (extended_l2) {
   3357         unsigned min_cluster_size =
   3358             (1 << MIN_CLUSTER_BITS) * QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER;
   3359         if (cluster_size < min_cluster_size) {
   3360             error_setg(errp, "Extended L2 entries are only supported with "
   3361                        "cluster sizes of at least %u bytes", min_cluster_size);
   3362             return false;
   3363         }
   3364     }
   3365 
   3366     return true;
   3367 }
   3368 
   3369 static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, bool extended_l2,
   3370                                              Error **errp)
   3371 {
   3372     size_t cluster_size;
   3373 
   3374     cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
   3375                                          DEFAULT_CLUSTER_SIZE);
   3376     if (!validate_cluster_size(cluster_size, extended_l2, errp)) {
   3377         return 0;
   3378     }
   3379     return cluster_size;
   3380 }
   3381 
   3382 static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
   3383 {
   3384     char *buf;
   3385     int ret;
   3386 
   3387     buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
   3388     if (!buf) {
   3389         ret = 3; /* default */
   3390     } else if (!strcmp(buf, "0.10")) {
   3391         ret = 2;
   3392     } else if (!strcmp(buf, "1.1")) {
   3393         ret = 3;
   3394     } else {
   3395         error_setg(errp, "Invalid compatibility level: '%s'", buf);
   3396         ret = -EINVAL;
   3397     }
   3398     g_free(buf);
   3399     return ret;
   3400 }
   3401 
   3402 static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
   3403                                                 Error **errp)
   3404 {
   3405     uint64_t refcount_bits;
   3406 
   3407     refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
   3408     if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
   3409         error_setg(errp, "Refcount width must be a power of two and may not "
   3410                    "exceed 64 bits");
   3411         return 0;
   3412     }
   3413 
   3414     if (version < 3 && refcount_bits != 16) {
   3415         error_setg(errp, "Different refcount widths than 16 bits require "
   3416                    "compatibility level 1.1 or above (use compat=1.1 or "
   3417                    "greater)");
   3418         return 0;
   3419     }
   3420 
   3421     return refcount_bits;
   3422 }
   3423 
   3424 static int coroutine_fn
   3425 qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
   3426 {
   3427     BlockdevCreateOptionsQcow2 *qcow2_opts;
   3428     QDict *options;
   3429 
   3430     /*
   3431      * Open the image file and write a minimal qcow2 header.
   3432      *
   3433      * We keep things simple and start with a zero-sized image. We also
   3434      * do without refcount blocks or a L1 table for now. We'll fix the
   3435      * inconsistency later.
   3436      *
   3437      * We do need a refcount table because growing the refcount table means
   3438      * allocating two new refcount blocks - the second of which would be at
   3439      * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
   3440      * size for any qcow2 image.
   3441      */
   3442     BlockBackend *blk = NULL;
   3443     BlockDriverState *bs = NULL;
   3444     BlockDriverState *data_bs = NULL;
   3445     QCowHeader *header;
   3446     size_t cluster_size;
   3447     int version;
   3448     int refcount_order;
   3449     uint64_t *refcount_table;
   3450     int ret;
   3451     uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
   3452 
   3453     assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
   3454     qcow2_opts = &create_options->u.qcow2;
   3455 
   3456     bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
   3457     if (bs == NULL) {
   3458         return -EIO;
   3459     }
   3460 
   3461     /* Validate options and set default values */
   3462     if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
   3463         error_setg(errp, "Image size must be a multiple of %u bytes",
   3464                    (unsigned) BDRV_SECTOR_SIZE);
   3465         ret = -EINVAL;
   3466         goto out;
   3467     }
   3468 
   3469     if (qcow2_opts->has_version) {
   3470         switch (qcow2_opts->version) {
   3471         case BLOCKDEV_QCOW2_VERSION_V2:
   3472             version = 2;
   3473             break;
   3474         case BLOCKDEV_QCOW2_VERSION_V3:
   3475             version = 3;
   3476             break;
   3477         default:
   3478             g_assert_not_reached();
   3479         }
   3480     } else {
   3481         version = 3;
   3482     }
   3483 
   3484     if (qcow2_opts->has_cluster_size) {
   3485         cluster_size = qcow2_opts->cluster_size;
   3486     } else {
   3487         cluster_size = DEFAULT_CLUSTER_SIZE;
   3488     }
   3489 
   3490     if (!qcow2_opts->has_extended_l2) {
   3491         qcow2_opts->extended_l2 = false;
   3492     }
   3493     if (qcow2_opts->extended_l2) {
   3494         if (version < 3) {
   3495             error_setg(errp, "Extended L2 entries are only supported with "
   3496                        "compatibility level 1.1 and above (use version=v3 or "
   3497                        "greater)");
   3498             ret = -EINVAL;
   3499             goto out;
   3500         }
   3501     }
   3502 
   3503     if (!validate_cluster_size(cluster_size, qcow2_opts->extended_l2, errp)) {
   3504         ret = -EINVAL;
   3505         goto out;
   3506     }
   3507 
   3508     if (!qcow2_opts->has_preallocation) {
   3509         qcow2_opts->preallocation = PREALLOC_MODE_OFF;
   3510     }
   3511     if (qcow2_opts->has_backing_file &&
   3512         qcow2_opts->preallocation != PREALLOC_MODE_OFF &&
   3513         !qcow2_opts->extended_l2)
   3514     {
   3515         error_setg(errp, "Backing file and preallocation can only be used at "
   3516                    "the same time if extended_l2 is on");
   3517         ret = -EINVAL;
   3518         goto out;
   3519     }
   3520     if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) {
   3521         error_setg(errp, "Backing format cannot be used without backing file");
   3522         ret = -EINVAL;
   3523         goto out;
   3524     }
   3525 
   3526     if (!qcow2_opts->has_lazy_refcounts) {
   3527         qcow2_opts->lazy_refcounts = false;
   3528     }
   3529     if (version < 3 && qcow2_opts->lazy_refcounts) {
   3530         error_setg(errp, "Lazy refcounts only supported with compatibility "
   3531                    "level 1.1 and above (use version=v3 or greater)");
   3532         ret = -EINVAL;
   3533         goto out;
   3534     }
   3535 
   3536     if (!qcow2_opts->has_refcount_bits) {
   3537         qcow2_opts->refcount_bits = 16;
   3538     }
   3539     if (qcow2_opts->refcount_bits > 64 ||
   3540         !is_power_of_2(qcow2_opts->refcount_bits))
   3541     {
   3542         error_setg(errp, "Refcount width must be a power of two and may not "
   3543                    "exceed 64 bits");
   3544         ret = -EINVAL;
   3545         goto out;
   3546     }
   3547     if (version < 3 && qcow2_opts->refcount_bits != 16) {
   3548         error_setg(errp, "Different refcount widths than 16 bits require "
   3549                    "compatibility level 1.1 or above (use version=v3 or "
   3550                    "greater)");
   3551         ret = -EINVAL;
   3552         goto out;
   3553     }
   3554     refcount_order = ctz32(qcow2_opts->refcount_bits);
   3555 
   3556     if (qcow2_opts->data_file_raw && !qcow2_opts->data_file) {
   3557         error_setg(errp, "data-file-raw requires data-file");
   3558         ret = -EINVAL;
   3559         goto out;
   3560     }
   3561     if (qcow2_opts->data_file_raw && qcow2_opts->has_backing_file) {
   3562         error_setg(errp, "Backing file and data-file-raw cannot be used at "
   3563                    "the same time");
   3564         ret = -EINVAL;
   3565         goto out;
   3566     }
   3567     if (qcow2_opts->data_file_raw &&
   3568         qcow2_opts->preallocation == PREALLOC_MODE_OFF)
   3569     {
   3570         /*
   3571          * data-file-raw means that "the external data file can be
   3572          * read as a consistent standalone raw image without looking
   3573          * at the qcow2 metadata."  It does not say that the metadata
   3574          * must be ignored, though (and the qcow2 driver in fact does
   3575          * not ignore it), so the L1/L2 tables must be present and
   3576          * give a 1:1 mapping, so you get the same result regardless
   3577          * of whether you look at the metadata or whether you ignore
   3578          * it.
   3579          */
   3580         qcow2_opts->preallocation = PREALLOC_MODE_METADATA;
   3581 
   3582         /*
   3583          * Cannot use preallocation with backing files, but giving a
   3584          * backing file when specifying data_file_raw is an error
   3585          * anyway.
   3586          */
   3587         assert(!qcow2_opts->has_backing_file);
   3588     }
   3589 
   3590     if (qcow2_opts->data_file) {
   3591         if (version < 3) {
   3592             error_setg(errp, "External data files are only supported with "
   3593                        "compatibility level 1.1 and above (use version=v3 or "
   3594                        "greater)");
   3595             ret = -EINVAL;
   3596             goto out;
   3597         }
   3598         data_bs = bdrv_open_blockdev_ref(qcow2_opts->data_file, errp);
   3599         if (data_bs == NULL) {
   3600             ret = -EIO;
   3601             goto out;
   3602         }
   3603     }
   3604 
   3605     if (qcow2_opts->has_compression_type &&
   3606         qcow2_opts->compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {
   3607 
   3608         ret = -EINVAL;
   3609 
   3610         if (version < 3) {
   3611             error_setg(errp, "Non-zlib compression type is only supported with "
   3612                        "compatibility level 1.1 and above (use version=v3 or "
   3613                        "greater)");
   3614             goto out;
   3615         }
   3616 
   3617         switch (qcow2_opts->compression_type) {
   3618 #ifdef CONFIG_ZSTD
   3619         case QCOW2_COMPRESSION_TYPE_ZSTD:
   3620             break;
   3621 #endif
   3622         default:
   3623             error_setg(errp, "Unknown compression type");
   3624             goto out;
   3625         }
   3626 
   3627         compression_type = qcow2_opts->compression_type;
   3628     }
   3629 
   3630     /* Create BlockBackend to write to the image */
   3631     blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
   3632                           errp);
   3633     if (!blk) {
   3634         ret = -EPERM;
   3635         goto out;
   3636     }
   3637     blk_set_allow_write_beyond_eof(blk, true);
   3638 
   3639     /* Write the header */
   3640     QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
   3641     header = g_malloc0(cluster_size);
   3642     *header = (QCowHeader) {
   3643         .magic                      = cpu_to_be32(QCOW_MAGIC),
   3644         .version                    = cpu_to_be32(version),
   3645         .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
   3646         .size                       = cpu_to_be64(0),
   3647         .l1_table_offset            = cpu_to_be64(0),
   3648         .l1_size                    = cpu_to_be32(0),
   3649         .refcount_table_offset      = cpu_to_be64(cluster_size),
   3650         .refcount_table_clusters    = cpu_to_be32(1),
   3651         .refcount_order             = cpu_to_be32(refcount_order),
   3652         /* don't deal with endianness since compression_type is 1 byte long */
   3653         .compression_type           = compression_type,
   3654         .header_length              = cpu_to_be32(sizeof(*header)),
   3655     };
   3656 
   3657     /* We'll update this to correct value later */
   3658     header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
   3659 
   3660     if (qcow2_opts->lazy_refcounts) {
   3661         header->compatible_features |=
   3662             cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
   3663     }
   3664     if (data_bs) {
   3665         header->incompatible_features |=
   3666             cpu_to_be64(QCOW2_INCOMPAT_DATA_FILE);
   3667     }
   3668     if (qcow2_opts->data_file_raw) {
   3669         header->autoclear_features |=
   3670             cpu_to_be64(QCOW2_AUTOCLEAR_DATA_FILE_RAW);
   3671     }
   3672     if (compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {
   3673         header->incompatible_features |=
   3674             cpu_to_be64(QCOW2_INCOMPAT_COMPRESSION);
   3675     }
   3676 
   3677     if (qcow2_opts->extended_l2) {
   3678         header->incompatible_features |=
   3679             cpu_to_be64(QCOW2_INCOMPAT_EXTL2);
   3680     }
   3681 
   3682     ret = blk_co_pwrite(blk, 0, cluster_size, header, 0);
   3683     g_free(header);
   3684     if (ret < 0) {
   3685         error_setg_errno(errp, -ret, "Could not write qcow2 header");
   3686         goto out;
   3687     }
   3688 
   3689     /* Write a refcount table with one refcount block */
   3690     refcount_table = g_malloc0(2 * cluster_size);
   3691     refcount_table[0] = cpu_to_be64(2 * cluster_size);
   3692     ret = blk_co_pwrite(blk, cluster_size, 2 * cluster_size, refcount_table, 0);
   3693     g_free(refcount_table);
   3694 
   3695     if (ret < 0) {
   3696         error_setg_errno(errp, -ret, "Could not write refcount table");
   3697         goto out;
   3698     }
   3699 
   3700     blk_unref(blk);
   3701     blk = NULL;
   3702 
   3703     /*
   3704      * And now open the image and make it consistent first (i.e. increase the
   3705      * refcount of the cluster that is occupied by the header and the refcount
   3706      * table)
   3707      */
   3708     options = qdict_new();
   3709     qdict_put_str(options, "driver", "qcow2");
   3710     qdict_put_str(options, "file", bs->node_name);
   3711     if (data_bs) {
   3712         qdict_put_str(options, "data-file", data_bs->node_name);
   3713     }
   3714     blk = blk_new_open(NULL, NULL, options,
   3715                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
   3716                        errp);
   3717     if (blk == NULL) {
   3718         ret = -EIO;
   3719         goto out;
   3720     }
   3721 
   3722     ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
   3723     if (ret < 0) {
   3724         error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
   3725                          "header and refcount table");
   3726         goto out;
   3727 
   3728     } else if (ret != 0) {
   3729         error_report("Huh, first cluster in empty image is already in use?");
   3730         abort();
   3731     }
   3732 
   3733     /* Set the external data file if necessary */
   3734     if (data_bs) {
   3735         BDRVQcow2State *s = blk_bs(blk)->opaque;
   3736         s->image_data_file = g_strdup(data_bs->filename);
   3737     }
   3738 
   3739     /* Create a full header (including things like feature table) */
   3740     ret = qcow2_update_header(blk_bs(blk));
   3741     if (ret < 0) {
   3742         error_setg_errno(errp, -ret, "Could not update qcow2 header");
   3743         goto out;
   3744     }
   3745 
   3746     /* Okay, now that we have a valid image, let's give it the right size */
   3747     ret = blk_co_truncate(blk, qcow2_opts->size, false,
   3748                           qcow2_opts->preallocation, 0, errp);
   3749     if (ret < 0) {
   3750         error_prepend(errp, "Could not resize image: ");
   3751         goto out;
   3752     }
   3753 
   3754     /* Want a backing file? There you go. */
   3755     if (qcow2_opts->has_backing_file) {
   3756         const char *backing_format = NULL;
   3757 
   3758         if (qcow2_opts->has_backing_fmt) {
   3759             backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
   3760         }
   3761 
   3762         ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
   3763                                        backing_format, false);
   3764         if (ret < 0) {
   3765             error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
   3766                              "with format '%s'", qcow2_opts->backing_file,
   3767                              backing_format);
   3768             goto out;
   3769         }
   3770     }
   3771 
   3772     /* Want encryption? There you go. */
   3773     if (qcow2_opts->has_encrypt) {
   3774         ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
   3775         if (ret < 0) {
   3776             goto out;
   3777         }
   3778     }
   3779 
   3780     blk_unref(blk);
   3781     blk = NULL;
   3782 
   3783     /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
   3784      * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
   3785      * have to setup decryption context. We're not doing any I/O on the top
   3786      * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
   3787      * not have effect.
   3788      */
   3789     options = qdict_new();
   3790     qdict_put_str(options, "driver", "qcow2");
   3791     qdict_put_str(options, "file", bs->node_name);
   3792     if (data_bs) {
   3793         qdict_put_str(options, "data-file", data_bs->node_name);
   3794     }
   3795     blk = blk_new_open(NULL, NULL, options,
   3796                        BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
   3797                        errp);
   3798     if (blk == NULL) {
   3799         ret = -EIO;
   3800         goto out;
   3801     }
   3802 
   3803     ret = 0;
   3804 out:
   3805     blk_unref(blk);
   3806     bdrv_unref(bs);
   3807     bdrv_unref(data_bs);
   3808     return ret;
   3809 }
   3810 
   3811 static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv,
   3812                                              const char *filename,
   3813                                              QemuOpts *opts,
   3814                                              Error **errp)
   3815 {
   3816     BlockdevCreateOptions *create_options = NULL;
   3817     QDict *qdict;
   3818     Visitor *v;
   3819     BlockDriverState *bs = NULL;
   3820     BlockDriverState *data_bs = NULL;
   3821     const char *val;
   3822     int ret;
   3823 
   3824     /* Only the keyval visitor supports the dotted syntax needed for
   3825      * encryption, so go through a QDict before getting a QAPI type. Ignore
   3826      * options meant for the protocol layer so that the visitor doesn't
   3827      * complain. */
   3828     qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
   3829                                         true);
   3830 
   3831     /* Handle encryption options */
   3832     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
   3833     if (val && !strcmp(val, "on")) {
   3834         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
   3835     } else if (val && !strcmp(val, "off")) {
   3836         qdict_del(qdict, BLOCK_OPT_ENCRYPT);
   3837     }
   3838 
   3839     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
   3840     if (val && !strcmp(val, "aes")) {
   3841         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
   3842     }
   3843 
   3844     /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
   3845      * version=v2/v3 below. */
   3846     val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
   3847     if (val && !strcmp(val, "0.10")) {
   3848         qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
   3849     } else if (val && !strcmp(val, "1.1")) {
   3850         qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
   3851     }
   3852 
   3853     /* Change legacy command line options into QMP ones */
   3854     static const QDictRenames opt_renames[] = {
   3855         { BLOCK_OPT_BACKING_FILE,       "backing-file" },
   3856         { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
   3857         { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
   3858         { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
   3859         { BLOCK_OPT_EXTL2,              "extended-l2" },
   3860         { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
   3861         { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
   3862         { BLOCK_OPT_COMPAT_LEVEL,       "version" },
   3863         { BLOCK_OPT_DATA_FILE_RAW,      "data-file-raw" },
   3864         { BLOCK_OPT_COMPRESSION_TYPE,   "compression-type" },
   3865         { NULL, NULL },
   3866     };
   3867 
   3868     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
   3869         ret = -EINVAL;
   3870         goto finish;
   3871     }
   3872 
   3873     /* Create and open the file (protocol layer) */
   3874     ret = bdrv_create_file(filename, opts, errp);
   3875     if (ret < 0) {
   3876         goto finish;
   3877     }
   3878 
   3879     bs = bdrv_open(filename, NULL, NULL,
   3880                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
   3881     if (bs == NULL) {
   3882         ret = -EIO;
   3883         goto finish;
   3884     }
   3885 
   3886     /* Create and open an external data file (protocol layer) */
   3887     val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
   3888     if (val) {
   3889         ret = bdrv_create_file(val, opts, errp);
   3890         if (ret < 0) {
   3891             goto finish;
   3892         }
   3893 
   3894         data_bs = bdrv_open(val, NULL, NULL,
   3895                             BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
   3896                             errp);
   3897         if (data_bs == NULL) {
   3898             ret = -EIO;
   3899             goto finish;
   3900         }
   3901 
   3902         qdict_del(qdict, BLOCK_OPT_DATA_FILE);
   3903         qdict_put_str(qdict, "data-file", data_bs->node_name);
   3904     }
   3905 
   3906     /* Set 'driver' and 'node' options */
   3907     qdict_put_str(qdict, "driver", "qcow2");
   3908     qdict_put_str(qdict, "file", bs->node_name);
   3909 
   3910     /* Now get the QAPI type BlockdevCreateOptions */
   3911     v = qobject_input_visitor_new_flat_confused(qdict, errp);
   3912     if (!v) {
   3913         ret = -EINVAL;
   3914         goto finish;
   3915     }
   3916 
   3917     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
   3918     visit_free(v);
   3919     if (!create_options) {
   3920         ret = -EINVAL;
   3921         goto finish;
   3922     }
   3923 
   3924     /* Silently round up size */
   3925     create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
   3926                                             BDRV_SECTOR_SIZE);
   3927 
   3928     /* Create the qcow2 image (format layer) */
   3929     ret = qcow2_co_create(create_options, errp);
   3930 finish:
   3931     if (ret < 0) {
   3932         bdrv_co_delete_file_noerr(bs);
   3933         bdrv_co_delete_file_noerr(data_bs);
   3934     } else {
   3935         ret = 0;
   3936     }
   3937 
   3938     qobject_unref(qdict);
   3939     bdrv_unref(bs);
   3940     bdrv_unref(data_bs);
   3941     qapi_free_BlockdevCreateOptions(create_options);
   3942     return ret;
   3943 }
   3944 
   3945 
   3946 static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
   3947 {
   3948     int64_t nr;
   3949     int res;
   3950 
   3951     /* Clamp to image length, before checking status of underlying sectors */
   3952     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
   3953         bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
   3954     }
   3955 
   3956     if (!bytes) {
   3957         return true;
   3958     }
   3959 
   3960     /*
   3961      * bdrv_block_status_above doesn't merge different types of zeros, for
   3962      * example, zeros which come from the region which is unallocated in
   3963      * the whole backing chain, and zeros which come because of a short
   3964      * backing file. So, we need a loop.
   3965      */
   3966     do {
   3967         res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
   3968         offset += nr;
   3969         bytes -= nr;
   3970     } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
   3971 
   3972     return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
   3973 }
   3974 
   3975 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
   3976     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
   3977 {
   3978     int ret;
   3979     BDRVQcow2State *s = bs->opaque;
   3980 
   3981     uint32_t head = offset_into_subcluster(s, offset);
   3982     uint32_t tail = ROUND_UP(offset + bytes, s->subcluster_size) -
   3983         (offset + bytes);
   3984 
   3985     trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
   3986     if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
   3987         tail = 0;
   3988     }
   3989 
   3990     if (head || tail) {
   3991         uint64_t off;
   3992         unsigned int nr;
   3993         QCow2SubclusterType type;
   3994 
   3995         assert(head + bytes + tail <= s->subcluster_size);
   3996 
   3997         /* check whether remainder of cluster already reads as zero */
   3998         if (!(is_zero(bs, offset - head, head) &&
   3999               is_zero(bs, offset + bytes, tail))) {
   4000             return -ENOTSUP;
   4001         }
   4002 
   4003         qemu_co_mutex_lock(&s->lock);
   4004         /* We can have new write after previous check */
   4005         offset -= head;
   4006         bytes = s->subcluster_size;
   4007         nr = s->subcluster_size;
   4008         ret = qcow2_get_host_offset(bs, offset, &nr, &off, &type);
   4009         if (ret < 0 ||
   4010             (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
   4011              type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC &&
   4012              type != QCOW2_SUBCLUSTER_ZERO_PLAIN &&
   4013              type != QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
   4014             qemu_co_mutex_unlock(&s->lock);
   4015             return ret < 0 ? ret : -ENOTSUP;
   4016         }
   4017     } else {
   4018         qemu_co_mutex_lock(&s->lock);
   4019     }
   4020 
   4021     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);
   4022 
   4023     /* Whatever is left can use real zero subclusters */
   4024     ret = qcow2_subcluster_zeroize(bs, offset, bytes, flags);
   4025     qemu_co_mutex_unlock(&s->lock);
   4026 
   4027     return ret;
   4028 }
   4029 
   4030 static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
   4031                                           int64_t offset, int64_t bytes)
   4032 {
   4033     int ret;
   4034     BDRVQcow2State *s = bs->opaque;
   4035 
   4036     /* If the image does not support QCOW_OFLAG_ZERO then discarding
   4037      * clusters could expose stale data from the backing file. */
   4038     if (s->qcow_version < 3 && bs->backing) {
   4039         return -ENOTSUP;
   4040     }
   4041 
   4042     if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
   4043         assert(bytes < s->cluster_size);
   4044         /* Ignore partial clusters, except for the special case of the
   4045          * complete partial cluster at the end of an unaligned file */
   4046         if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
   4047             offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
   4048             return -ENOTSUP;
   4049         }
   4050     }
   4051 
   4052     qemu_co_mutex_lock(&s->lock);
   4053     ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
   4054                                 false);
   4055     qemu_co_mutex_unlock(&s->lock);
   4056     return ret;
   4057 }
   4058 
   4059 static int coroutine_fn
   4060 qcow2_co_copy_range_from(BlockDriverState *bs,
   4061                          BdrvChild *src, int64_t src_offset,
   4062                          BdrvChild *dst, int64_t dst_offset,
   4063                          int64_t bytes, BdrvRequestFlags read_flags,
   4064                          BdrvRequestFlags write_flags)
   4065 {
   4066     BDRVQcow2State *s = bs->opaque;
   4067     int ret;
   4068     unsigned int cur_bytes; /* number of bytes in current iteration */
   4069     BdrvChild *child = NULL;
   4070     BdrvRequestFlags cur_write_flags;
   4071 
   4072     assert(!bs->encrypted);
   4073     qemu_co_mutex_lock(&s->lock);
   4074 
   4075     while (bytes != 0) {
   4076         uint64_t copy_offset = 0;
   4077         QCow2SubclusterType type;
   4078         /* prepare next request */
   4079         cur_bytes = MIN(bytes, INT_MAX);
   4080         cur_write_flags = write_flags;
   4081 
   4082         ret = qcow2_get_host_offset(bs, src_offset, &cur_bytes,
   4083                                     &copy_offset, &type);
   4084         if (ret < 0) {
   4085             goto out;
   4086         }
   4087 
   4088         switch (type) {
   4089         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
   4090         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
   4091             if (bs->backing && bs->backing->bs) {
   4092                 int64_t backing_length = bdrv_getlength(bs->backing->bs);
   4093                 if (src_offset >= backing_length) {
   4094                     cur_write_flags |= BDRV_REQ_ZERO_WRITE;
   4095                 } else {
   4096                     child = bs->backing;
   4097                     cur_bytes = MIN(cur_bytes, backing_length - src_offset);
   4098                     copy_offset = src_offset;
   4099                 }
   4100             } else {
   4101                 cur_write_flags |= BDRV_REQ_ZERO_WRITE;
   4102             }
   4103             break;
   4104 
   4105         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
   4106         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
   4107             cur_write_flags |= BDRV_REQ_ZERO_WRITE;
   4108             break;
   4109 
   4110         case QCOW2_SUBCLUSTER_COMPRESSED:
   4111             ret = -ENOTSUP;
   4112             goto out;
   4113 
   4114         case QCOW2_SUBCLUSTER_NORMAL:
   4115             child = s->data_file;
   4116             break;
   4117 
   4118         default:
   4119             abort();
   4120         }
   4121         qemu_co_mutex_unlock(&s->lock);
   4122         ret = bdrv_co_copy_range_from(child,
   4123                                       copy_offset,
   4124                                       dst, dst_offset,
   4125                                       cur_bytes, read_flags, cur_write_flags);
   4126         qemu_co_mutex_lock(&s->lock);
   4127         if (ret < 0) {
   4128             goto out;
   4129         }
   4130 
   4131         bytes -= cur_bytes;
   4132         src_offset += cur_bytes;
   4133         dst_offset += cur_bytes;
   4134     }
   4135     ret = 0;
   4136 
   4137 out:
   4138     qemu_co_mutex_unlock(&s->lock);
   4139     return ret;
   4140 }
   4141 
   4142 static int coroutine_fn
   4143 qcow2_co_copy_range_to(BlockDriverState *bs,
   4144                        BdrvChild *src, int64_t src_offset,
   4145                        BdrvChild *dst, int64_t dst_offset,
   4146                        int64_t bytes, BdrvRequestFlags read_flags,
   4147                        BdrvRequestFlags write_flags)
   4148 {
   4149     BDRVQcow2State *s = bs->opaque;
   4150     int ret;
   4151     unsigned int cur_bytes; /* number of sectors in current iteration */
   4152     uint64_t host_offset;
   4153     QCowL2Meta *l2meta = NULL;
   4154 
   4155     assert(!bs->encrypted);
   4156 
   4157     qemu_co_mutex_lock(&s->lock);
   4158 
   4159     while (bytes != 0) {
   4160 
   4161         l2meta = NULL;
   4162 
   4163         cur_bytes = MIN(bytes, INT_MAX);
   4164 
   4165         /* TODO:
   4166          * If src->bs == dst->bs, we could simply copy by incrementing
   4167          * the refcnt, without copying user data.
   4168          * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
   4169         ret = qcow2_alloc_host_offset(bs, dst_offset, &cur_bytes,
   4170                                       &host_offset, &l2meta);
   4171         if (ret < 0) {
   4172             goto fail;
   4173         }
   4174 
   4175         ret = qcow2_pre_write_overlap_check(bs, 0, host_offset, cur_bytes,
   4176                                             true);
   4177         if (ret < 0) {
   4178             goto fail;
   4179         }
   4180 
   4181         qemu_co_mutex_unlock(&s->lock);
   4182         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
   4183                                     cur_bytes, read_flags, write_flags);
   4184         qemu_co_mutex_lock(&s->lock);
   4185         if (ret < 0) {
   4186             goto fail;
   4187         }
   4188 
   4189         ret = qcow2_handle_l2meta(bs, &l2meta, true);
   4190         if (ret) {
   4191             goto fail;
   4192         }
   4193 
   4194         bytes -= cur_bytes;
   4195         src_offset += cur_bytes;
   4196         dst_offset += cur_bytes;
   4197     }
   4198     ret = 0;
   4199 
   4200 fail:
   4201     qcow2_handle_l2meta(bs, &l2meta, false);
   4202 
   4203     qemu_co_mutex_unlock(&s->lock);
   4204 
   4205     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
   4206 
   4207     return ret;
   4208 }
   4209 
/*
 * Resize the image to @offset bytes.
 *
 * @offset must be a multiple of BDRV_SECTOR_SIZE.  @prealloc must be one of
 * OFF, METADATA, FALLOC or FULL, and must be OFF when shrinking.
 *
 * Shrinking discards the clusters past the new end, shrinks the L1 table and
 * the refcount table and then tries (best effort, failure only warns) to cut
 * off the unused tail of the image file.  Growing enlarges the L1 table and
 * then preallocates as requested by @prealloc.  If @flags contains
 * BDRV_REQ_ZERO_WRITE, the added area is additionally zeroed.  At the end
 * bs->total_sectors and the size field of the header are updated and the
 * driver options are re-applied.
 *
 * @exact is only forwarded to the external data file (PREALLOC_MODE_OFF
 * case); it is not passed on for the image file itself.
 *
 * s->lock is held for the whole function, except around the
 * qcow2_co_pwritev_part() call that zeroes an unaligned head.
 *
 * Returns 0 on success and a negative errno on failure; every failure path
 * either sets @errp itself or passes @errp to the failing callee.
 */
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
                                          bool exact, PreallocMode prealloc,
                                          BdrvRequestFlags flags, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_length;
    int64_t new_l1_size;
    int ret;
    QDict *options;

    /* Argument validation that does not need the lock */
    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
    {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "The new size must be a multiple of %u",
                   (unsigned) BDRV_SECTOR_SIZE);
        return -EINVAL;
    }

    qemu_co_mutex_lock(&s->lock);

    /*
     * Even though we store snapshot size for all images, it was not
     * required until v3, so it is not safe to proceed for v2.
     */
    if (s->nb_snapshots && s->qcow_version < 3) {
        error_setg(errp, "Can't resize a v2 image which has snapshots");
        ret = -ENOTSUP;
        goto fail;
    }

    /* See qcow2-bitmap.c for which bitmap scenarios prevent a resize. */
    if (qcow2_truncate_bitmaps_check(bs, errp)) {
        ret = -ENOTSUP;
        goto fail;
    }

    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
    new_l1_size = size_to_l1(s, offset);

    if (offset < old_length) {
        /* Shrinking */
        int64_t last_cluster, old_file_size;
        if (prealloc != PREALLOC_MODE_OFF) {
            error_setg(errp,
                       "Preallocation can't be used for shrinking an image");
            ret = -EINVAL;
            goto fail;
        }

        /* Discard every cluster that starts at or after the new end */
        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
                                    old_length - ROUND_UP(offset,
                                                          s->cluster_size),
                                    QCOW2_DISCARD_ALWAYS, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
            goto fail;
        }

        ret = qcow2_shrink_l1_table(bs, new_l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to reduce the number of L2 tables");
            goto fail;
        }

        ret = qcow2_shrink_reftable(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to discard unused refblocks");
            goto fail;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster < 0) {
            error_setg_errno(errp, -last_cluster,
                             "Failed to find the last cluster");
            ret = last_cluster;
            goto fail;
        }
        /* Only truncate if the file extends past the last cluster found */
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
            Error *local_err = NULL;

            /*
             * Do not pass @exact here: It will not help the user if
             * we get an error here just because they wanted to shrink
             * their qcow2 image (on a block device) with qemu-img.
             * (And on the qcow2 layer, the @exact requirement is
             * always fulfilled, so there is no need to pass it on.)
             */
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
                             false, PREALLOC_MODE_OFF, 0, &local_err);
            /* Best effort: a failure here is reported as a warning only */
            if (local_err) {
                warn_reportf_err(local_err,
                                 "Failed to truncate the tail of the image: ");
            }
        }
    } else {
        /* Growing (or keeping the size) */
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
            goto fail;
        }

        if (data_file_is_raw(bs) && prealloc == PREALLOC_MODE_OFF) {
            /*
             * When creating a qcow2 image with data-file-raw, we enforce
             * at least prealloc=metadata, so that the L1/L2 tables are
             * fully allocated and reading from the data file will return
             * the same data as reading from the qcow2 image.  When the
             * image is grown, we must consequently preallocate the
             * metadata structures to cover the added area.
             */
            prealloc = PREALLOC_MODE_METADATA;
        }
    }

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
        if (has_data_file(bs)) {
            /*
             * If the caller wants an exact resize, the external data
             * file should be resized to the exact target size, too,
             * so we pass @exact here.
             */
            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0,
                                   errp);
            if (ret < 0) {
                goto fail;
            }
        }
        break;

    case PREALLOC_MODE_METADATA:
        ret = preallocate_co(bs, old_length, offset, prealloc, errp);
        if (ret < 0) {
            goto fail;
        }
        break;

    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, last_cluster, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;
        bool subclusters_need_allocation = false;

        /* With a data file, preallocation means just allocating the metadata
         * and forwarding the truncate request to the data file */
        if (has_data_file(bs)) {
            ret = preallocate_co(bs, old_length, offset, prealloc, errp);
            if (ret < 0) {
                goto fail;
            }
            break;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }

        /*
         * Start allocating right after the last cluster found; if that
         * lookup fails, fall back to the cluster-aligned file length.
         */
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster >= 0) {
            old_file_size = (last_cluster + 1) * s->cluster_size;
        } else {
            old_file_size = ROUND_UP(old_file_size, s->cluster_size);
        }

        /* Clusters between the start of the old last cluster and the new end */
        nb_new_data_clusters = (ROUND_UP(offset, s->cluster_size) -
            start_of_cluster(s, old_length)) >> s->cluster_bits;

        /* This is an overestimation; we will not actually allocate space for
         * these in the file but just make sure the new refcount structures are
         * able to cover them so we will not have to allocate new refblocks
         * while entering the data blocks in the potentially new L2 tables.
         * (We do not actually care where the L2 tables are placed. Maybe they
         *  are already allocated or they can be placed somewhere before
         *  @old_file_size. It does not matter because they will be fully
         *  allocated automatically, so they do not need to be covered by the
         *  preallocation. All that matters is that we will not have to allocate
         *  new refcount structures for them.) */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / l2_entry_size(s));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
            ret = allocation_start;
            goto fail;
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
            ret = clusters_allocated;
            goto fail;
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
        /*
         * Image file grows, so @exact does not matter.
         *
         * If we need to zero out the new area, try first whether the protocol
         * driver can already take care of this.
         */
        if (flags & BDRV_REQ_ZERO_WRITE) {
            /* errp is NULL: a failure here just triggers the fallback below */
            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc,
                                   BDRV_REQ_ZERO_WRITE, NULL);
            if (ret >= 0) {
                flags &= ~BDRV_REQ_ZERO_WRITE;
                /* Ensure that we read zeroes and not backing file data */
                subclusters_need_allocation = true;
            }
        } else {
            /* Force the plain truncate below */
            ret = -1;
        }
        if (ret < 0) {
            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
                                   errp);
        }
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            /* Give back the data clusters allocated above */
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
            goto fail;
        }

        /* Create the necessary L2 entries */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
            /* Link at most up to the end of the current L2 slice per pass */
            int64_t nb_clusters = MIN(
                nb_new_data_clusters,
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
            unsigned cow_start_length = offset_into_cluster(s, guest_offset);
            QCowL2Meta allocation;
            guest_offset = start_of_cluster(s, guest_offset);
            allocation = (QCowL2Meta) {
                .offset       = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters  = nb_clusters,
                .cow_start    = {
                    .offset       = 0,
                    .nb_bytes     = cow_start_length,
                },
                .cow_end      = {
                    .offset       = nb_clusters << s->cluster_bits,
                    .nb_bytes     = 0,
                },
                .prealloc     = !subclusters_need_allocation,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                /* Free the clusters that have not been linked yet */
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
                goto fail;
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

    default:
        g_assert_not_reached();
    }

    /*
     * Still set only if zeroing was requested and not already done by the
     * protocol driver in the FALLOC/FULL path above.
     */
    if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) {
        uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->subcluster_size);

        /*
         * Use zero clusters as much as we can. qcow2_subcluster_zeroize()
         * requires a subcluster-aligned start. The end may be unaligned if
         * it is at the end of the image (which it is here).
         */
        if (offset > zero_start) {
            ret = qcow2_subcluster_zeroize(bs, zero_start, offset - zero_start,
                                           0);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to zero out new clusters");
                goto fail;
            }
        }

        /* Write explicit zeros for the unaligned head */
        if (zero_start > old_length) {
            uint64_t len = MIN(zero_start, offset) - old_length;
            uint8_t *buf = qemu_blockalign0(bs, len);
            QEMUIOVector qiov;
            qemu_iovec_init_buf(&qiov, buf, len);

            /* The lock is dropped around the write and re-taken afterwards */
            qemu_co_mutex_unlock(&s->lock);
            ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0);
            qemu_co_mutex_lock(&s->lock);

            qemu_vfree(buf);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to zero out the new area");
                goto fail;
            }
        }
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
        ret = qcow2_write_caches(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
            goto fail;
        }
    }

    bs->total_sectors = offset / BDRV_SECTOR_SIZE;

    /* write updated header.size */
    /* From here on @offset holds the big-endian value written to the header */
    offset = cpu_to_be64(offset);
    ret = bdrv_co_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                              sizeof(offset), &offset, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image size");
        goto fail;
    }

    s->l1_vm_state_index = new_l1_size;

    /* Update cache sizes */
    options = qdict_clone_shallow(bs->options);
    ret = qcow2_update_options(bs, options, s->flags, errp);
    qobject_unref(options);
    if (ret < 0) {
        goto fail;
    }
    ret = 0;
fail:
    /* Common exit for success and failure: only the lock is released here */
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}
   4585 
   4586 static coroutine_fn int
   4587 qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
   4588                                  uint64_t offset, uint64_t bytes,
   4589                                  QEMUIOVector *qiov, size_t qiov_offset)
   4590 {
   4591     BDRVQcow2State *s = bs->opaque;
   4592     int ret;
   4593     ssize_t out_len;
   4594     uint8_t *buf, *out_buf;
   4595     uint64_t cluster_offset;
   4596 
   4597     assert(bytes == s->cluster_size || (bytes < s->cluster_size &&
   4598            (offset + bytes == bs->total_sectors << BDRV_SECTOR_BITS)));
   4599 
   4600     buf = qemu_blockalign(bs, s->cluster_size);
   4601     if (bytes < s->cluster_size) {
   4602         /* Zero-pad last write if image size is not cluster aligned */
   4603         memset(buf + bytes, 0, s->cluster_size - bytes);
   4604     }
   4605     qemu_iovec_to_buf(qiov, qiov_offset, buf, bytes);
   4606 
   4607     out_buf = g_malloc(s->cluster_size);
   4608 
   4609     out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1,
   4610                                 buf, s->cluster_size);
   4611     if (out_len == -ENOMEM) {
   4612         /* could not compress: write normal cluster */
   4613         ret = qcow2_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 0);
   4614         if (ret < 0) {
   4615             goto fail;
   4616         }
   4617         goto success;
   4618     } else if (out_len < 0) {
   4619         ret = -EINVAL;
   4620         goto fail;
   4621     }
   4622 
   4623     qemu_co_mutex_lock(&s->lock);
   4624     ret = qcow2_alloc_compressed_cluster_offset(bs, offset, out_len,
   4625                                                 &cluster_offset);
   4626     if (ret < 0) {
   4627         qemu_co_mutex_unlock(&s->lock);
   4628         goto fail;
   4629     }
   4630 
   4631     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
   4632     qemu_co_mutex_unlock(&s->lock);
   4633     if (ret < 0) {
   4634         goto fail;
   4635     }
   4636 
   4637     BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
   4638     ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
   4639     if (ret < 0) {
   4640         goto fail;
   4641     }
   4642 success:
   4643     ret = 0;
   4644 fail:
   4645     qemu_vfree(buf);
   4646     g_free(out_buf);
   4647     return ret;
   4648 }
   4649 
   4650 static coroutine_fn int qcow2_co_pwritev_compressed_task_entry(AioTask *task)
   4651 {
   4652     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
   4653 
   4654     assert(!t->subcluster_type && !t->l2meta);
   4655 
   4656     return qcow2_co_pwritev_compressed_task(t->bs, t->offset, t->bytes, t->qiov,
   4657                                             t->qiov_offset);
   4658 }
   4659 
   4660 /*
   4661  * XXX: put compressed sectors first, then all the cluster aligned
   4662  * tables to avoid losing bytes in alignment
   4663  */
   4664 static coroutine_fn int
   4665 qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
   4666                                  int64_t offset, int64_t bytes,
   4667                                  QEMUIOVector *qiov, size_t qiov_offset)
   4668 {
   4669     BDRVQcow2State *s = bs->opaque;
   4670     AioTaskPool *aio = NULL;
   4671     int ret = 0;
   4672 
   4673     if (has_data_file(bs)) {
   4674         return -ENOTSUP;
   4675     }
   4676 
   4677     if (bytes == 0) {
   4678         /*
   4679          * align end of file to a sector boundary to ease reading with
   4680          * sector based I/Os
   4681          */
   4682         int64_t len = bdrv_getlength(bs->file->bs);
   4683         if (len < 0) {
   4684             return len;
   4685         }
   4686         return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0,
   4687                                 NULL);
   4688     }
   4689 
   4690     if (offset_into_cluster(s, offset)) {
   4691         return -EINVAL;
   4692     }
   4693 
   4694     if (offset_into_cluster(s, bytes) &&
   4695         (offset + bytes) != (bs->total_sectors << BDRV_SECTOR_BITS)) {
   4696         return -EINVAL;
   4697     }
   4698 
   4699     while (bytes && aio_task_pool_status(aio) == 0) {
   4700         uint64_t chunk_size = MIN(bytes, s->cluster_size);
   4701 
   4702         if (!aio && chunk_size != bytes) {
   4703             aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
   4704         }
   4705 
   4706         ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_compressed_task_entry,
   4707                              0, 0, offset, chunk_size, qiov, qiov_offset, NULL);
   4708         if (ret < 0) {
   4709             break;
   4710         }
   4711         qiov_offset += chunk_size;
   4712         offset += chunk_size;
   4713         bytes -= chunk_size;
   4714     }
   4715 
   4716     if (aio) {
   4717         aio_task_pool_wait_all(aio);
   4718         if (ret == 0) {
   4719             ret = aio_task_pool_status(aio);
   4720         }
   4721         g_free(aio);
   4722     }
   4723 
   4724     return ret;
   4725 }
   4726 
   4727 static int coroutine_fn
   4728 qcow2_co_preadv_compressed(BlockDriverState *bs,
   4729                            uint64_t l2_entry,
   4730                            uint64_t offset,
   4731                            uint64_t bytes,
   4732                            QEMUIOVector *qiov,
   4733                            size_t qiov_offset)
   4734 {
   4735     BDRVQcow2State *s = bs->opaque;
   4736     int ret = 0, csize;
   4737     uint64_t coffset;
   4738     uint8_t *buf, *out_buf;
   4739     int offset_in_cluster = offset_into_cluster(s, offset);
   4740 
   4741     qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
   4742 
   4743     buf = g_try_malloc(csize);
   4744     if (!buf) {
   4745         return -ENOMEM;
   4746     }
   4747 
   4748     out_buf = qemu_blockalign(bs, s->cluster_size);
   4749 
   4750     BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
   4751     ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
   4752     if (ret < 0) {
   4753         goto fail;
   4754     }
   4755 
   4756     if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
   4757         ret = -EIO;
   4758         goto fail;
   4759     }
   4760 
   4761     qemu_iovec_from_buf(qiov, qiov_offset, out_buf + offset_in_cluster, bytes);
   4762 
   4763 fail:
   4764     qemu_vfree(out_buf);
   4765     g_free(buf);
   4766 
   4767     return ret;
   4768 }
   4769 
   4770 static int make_completely_empty(BlockDriverState *bs)
   4771 {
   4772     BDRVQcow2State *s = bs->opaque;
   4773     Error *local_err = NULL;
   4774     int ret, l1_clusters;
   4775     int64_t offset;
   4776     uint64_t *new_reftable = NULL;
   4777     uint64_t rt_entry, l1_size2;
   4778     struct {
   4779         uint64_t l1_offset;
   4780         uint64_t reftable_offset;
   4781         uint32_t reftable_clusters;
   4782     } QEMU_PACKED l1_ofs_rt_ofs_cls;
   4783 
   4784     ret = qcow2_cache_empty(bs, s->l2_table_cache);
   4785     if (ret < 0) {
   4786         goto fail;
   4787     }
   4788 
   4789     ret = qcow2_cache_empty(bs, s->refcount_block_cache);
   4790     if (ret < 0) {
   4791         goto fail;
   4792     }
   4793 
   4794     /* Refcounts will be broken utterly */
   4795     ret = qcow2_mark_dirty(bs);
   4796     if (ret < 0) {
   4797         goto fail;
   4798     }
   4799 
   4800     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
   4801 
   4802     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
   4803     l1_size2 = (uint64_t)s->l1_size * L1E_SIZE;
   4804 
   4805     /* After this call, neither the in-memory nor the on-disk refcount
   4806      * information accurately describe the actual references */
   4807 
   4808     ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
   4809                              l1_clusters * s->cluster_size, 0);
   4810     if (ret < 0) {
   4811         goto fail_broken_refcounts;
   4812     }
   4813     memset(s->l1_table, 0, l1_size2);
   4814 
   4815     BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
   4816 
   4817     /* Overwrite enough clusters at the beginning of the sectors to place
   4818      * the refcount table, a refcount block and the L1 table in; this may
   4819      * overwrite parts of the existing refcount and L1 table, which is not
   4820      * an issue because the dirty flag is set, complete data loss is in fact
   4821      * desired and partial data loss is consequently fine as well */
   4822     ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
   4823                              (2 + l1_clusters) * s->cluster_size, 0);
   4824     /* This call (even if it failed overall) may have overwritten on-disk
   4825      * refcount structures; in that case, the in-memory refcount information
   4826      * will probably differ from the on-disk information which makes the BDS
   4827      * unusable */
   4828     if (ret < 0) {
   4829         goto fail_broken_refcounts;
   4830     }
   4831 
   4832     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
   4833     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
   4834 
   4835     /* "Create" an empty reftable (one cluster) directly after the image
   4836      * header and an empty L1 table three clusters after the image header;
   4837      * the cluster between those two will be used as the first refblock */
   4838     l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
   4839     l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
   4840     l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
   4841     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
   4842                            sizeof(l1_ofs_rt_ofs_cls), &l1_ofs_rt_ofs_cls, 0);
   4843     if (ret < 0) {
   4844         goto fail_broken_refcounts;
   4845     }
   4846 
   4847     s->l1_table_offset = 3 * s->cluster_size;
   4848 
   4849     new_reftable = g_try_new0(uint64_t, s->cluster_size / REFTABLE_ENTRY_SIZE);
   4850     if (!new_reftable) {
   4851         ret = -ENOMEM;
   4852         goto fail_broken_refcounts;
   4853     }
   4854 
   4855     s->refcount_table_offset = s->cluster_size;
   4856     s->refcount_table_size   = s->cluster_size / REFTABLE_ENTRY_SIZE;
   4857     s->max_refcount_table_index = 0;
   4858 
   4859     g_free(s->refcount_table);
   4860     s->refcount_table = new_reftable;
   4861     new_reftable = NULL;
   4862 
   4863     /* Now the in-memory refcount information again corresponds to the on-disk
   4864      * information (reftable is empty and no refblocks (the refblock cache is
   4865      * empty)); however, this means some clusters (e.g. the image header) are
   4866      * referenced, but not refcounted, but the normal qcow2 code assumes that
   4867      * the in-memory information is always correct */
   4868 
   4869     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
   4870 
   4871     /* Enter the first refblock into the reftable */
   4872     rt_entry = cpu_to_be64(2 * s->cluster_size);
   4873     ret = bdrv_pwrite_sync(bs->file, s->cluster_size, sizeof(rt_entry),
   4874                            &rt_entry, 0);
   4875     if (ret < 0) {
   4876         goto fail_broken_refcounts;
   4877     }
   4878     s->refcount_table[0] = 2 * s->cluster_size;
   4879 
   4880     s->free_cluster_index = 0;
   4881     assert(3 + l1_clusters <= s->refcount_block_size);
   4882     offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
   4883     if (offset < 0) {
   4884         ret = offset;
   4885         goto fail_broken_refcounts;
   4886     } else if (offset > 0) {
   4887         error_report("First cluster in emptied image is in use");
   4888         abort();
   4889     }
   4890 
   4891     /* Now finally the in-memory information corresponds to the on-disk
   4892      * structures and is correct */
   4893     ret = qcow2_mark_clean(bs);
   4894     if (ret < 0) {
   4895         goto fail;
   4896     }
   4897 
   4898     ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
   4899                         PREALLOC_MODE_OFF, 0, &local_err);
   4900     if (ret < 0) {
   4901         error_report_err(local_err);
   4902         goto fail;
   4903     }
   4904 
   4905     return 0;
   4906 
   4907 fail_broken_refcounts:
   4908     /* The BDS is unusable at this point. If we wanted to make it usable, we
   4909      * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
   4910      * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
   4911      * again. However, because the functions which could have caused this error
   4912      * path to be taken are used by those functions as well, it's very likely
   4913      * that that sequence will fail as well. Therefore, just eject the BDS. */
   4914     bs->drv = NULL;
   4915 
   4916 fail:
   4917     g_free(new_reftable);
   4918     return ret;
   4919 }
   4920 
   4921 static int qcow2_make_empty(BlockDriverState *bs)
   4922 {
   4923     BDRVQcow2State *s = bs->opaque;
   4924     uint64_t offset, end_offset;
   4925     int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
   4926     int l1_clusters, ret = 0;
   4927 
   4928     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
   4929 
   4930     if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
   4931         3 + l1_clusters <= s->refcount_block_size &&
   4932         s->crypt_method_header != QCOW_CRYPT_LUKS &&
   4933         !has_data_file(bs)) {
   4934         /* The following function only works for qcow2 v3 images (it
   4935          * requires the dirty flag) and only as long as there are no
   4936          * features that reserve extra clusters (such as snapshots,
   4937          * LUKS header, or persistent bitmaps), because it completely
   4938          * empties the image.  Furthermore, the L1 table and three
   4939          * additional clusters (image header, refcount table, one
   4940          * refcount block) have to fit inside one refcount block. It
   4941          * only resets the image file, i.e. does not work with an
   4942          * external data file. */
   4943         return make_completely_empty(bs);
   4944     }
   4945 
   4946     /* This fallback code simply discards every active cluster; this is slow,
   4947      * but works in all cases */
   4948     end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
   4949     for (offset = 0; offset < end_offset; offset += step) {
   4950         /* As this function is generally used after committing an external
   4951          * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
   4952          * default action for this kind of discard is to pass the discard,
   4953          * which will ideally result in an actually smaller image file, as
   4954          * is probably desired. */
   4955         ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
   4956                                     QCOW2_DISCARD_SNAPSHOT, true);
   4957         if (ret < 0) {
   4958             break;
   4959         }
   4960     }
   4961 
   4962     return ret;
   4963 }
   4964 
   4965 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
   4966 {
   4967     BDRVQcow2State *s = bs->opaque;
   4968     int ret;
   4969 
   4970     qemu_co_mutex_lock(&s->lock);
   4971     ret = qcow2_write_caches(bs);
   4972     qemu_co_mutex_unlock(&s->lock);
   4973 
   4974     return ret;
   4975 }
   4976 
   4977 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
   4978                                        Error **errp)
   4979 {
   4980     Error *local_err = NULL;
   4981     BlockMeasureInfo *info;
   4982     uint64_t required = 0; /* bytes that contribute to required size */
   4983     uint64_t virtual_size; /* disk size as seen by guest */
   4984     uint64_t refcount_bits;
   4985     uint64_t l2_tables;
   4986     uint64_t luks_payload_size = 0;
   4987     size_t cluster_size;
   4988     int version;
   4989     char *optstr;
   4990     PreallocMode prealloc;
   4991     bool has_backing_file;
   4992     bool has_luks;
   4993     bool extended_l2;
   4994     size_t l2e_size;
   4995 
   4996     /* Parse image creation options */
   4997     extended_l2 = qemu_opt_get_bool_del(opts, BLOCK_OPT_EXTL2, false);
   4998 
   4999     cluster_size = qcow2_opt_get_cluster_size_del(opts, extended_l2,
   5000                                                   &local_err);
   5001     if (local_err) {
   5002         goto err;
   5003     }
   5004 
   5005     version = qcow2_opt_get_version_del(opts, &local_err);
   5006     if (local_err) {
   5007         goto err;
   5008     }
   5009 
   5010     refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
   5011     if (local_err) {
   5012         goto err;
   5013     }
   5014 
   5015     optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
   5016     prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
   5017                                PREALLOC_MODE_OFF, &local_err);
   5018     g_free(optstr);
   5019     if (local_err) {
   5020         goto err;
   5021     }
   5022 
   5023     optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
   5024     has_backing_file = !!optstr;
   5025     g_free(optstr);
   5026 
   5027     optstr = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
   5028     has_luks = optstr && strcmp(optstr, "luks") == 0;
   5029     g_free(optstr);
   5030 
   5031     if (has_luks) {
   5032         g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
   5033         QDict *cryptoopts = qcow2_extract_crypto_opts(opts, "luks", errp);
   5034         size_t headerlen;
   5035 
   5036         create_opts = block_crypto_create_opts_init(cryptoopts, errp);
   5037         qobject_unref(cryptoopts);
   5038         if (!create_opts) {
   5039             goto err;
   5040         }
   5041 
   5042         if (!qcrypto_block_calculate_payload_offset(create_opts,
   5043                                                     "encrypt.",
   5044                                                     &headerlen,
   5045                                                     &local_err)) {
   5046             goto err;
   5047         }
   5048 
   5049         luks_payload_size = ROUND_UP(headerlen, cluster_size);
   5050     }
   5051 
   5052     virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
   5053     virtual_size = ROUND_UP(virtual_size, cluster_size);
   5054 
   5055     /* Check that virtual disk size is valid */
   5056     l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
   5057     l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
   5058                              cluster_size / l2e_size);
   5059     if (l2_tables * L1E_SIZE > QCOW_MAX_L1_SIZE) {
   5060         error_setg(&local_err, "The image size is too large "
   5061                                "(try using a larger cluster size)");
   5062         goto err;
   5063     }
   5064 
   5065     /* Account for input image */
   5066     if (in_bs) {
   5067         int64_t ssize = bdrv_getlength(in_bs);
   5068         if (ssize < 0) {
   5069             error_setg_errno(&local_err, -ssize,
   5070                              "Unable to get image virtual_size");
   5071             goto err;
   5072         }
   5073 
   5074         virtual_size = ROUND_UP(ssize, cluster_size);
   5075 
   5076         if (has_backing_file) {
   5077             /* We don't how much of the backing chain is shared by the input
   5078              * image and the new image file.  In the worst case the new image's
   5079              * backing file has nothing in common with the input image.  Be
   5080              * conservative and assume all clusters need to be written.
   5081              */
   5082             required = virtual_size;
   5083         } else {
   5084             int64_t offset;
   5085             int64_t pnum = 0;
   5086 
   5087             for (offset = 0; offset < ssize; offset += pnum) {
   5088                 int ret;
   5089 
   5090                 ret = bdrv_block_status_above(in_bs, NULL, offset,
   5091                                               ssize - offset, &pnum, NULL,
   5092                                               NULL);
   5093                 if (ret < 0) {
   5094                     error_setg_errno(&local_err, -ret,
   5095                                      "Unable to get block status");
   5096                     goto err;
   5097                 }
   5098 
   5099                 if (ret & BDRV_BLOCK_ZERO) {
   5100                     /* Skip zero regions (safe with no backing file) */
   5101                 } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
   5102                            (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
   5103                     /* Extend pnum to end of cluster for next iteration */
   5104                     pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
   5105 
   5106                     /* Count clusters we've seen */
   5107                     required += offset % cluster_size + pnum;
   5108                 }
   5109             }
   5110         }
   5111     }
   5112 
   5113     /* Take into account preallocation.  Nothing special is needed for
   5114      * PREALLOC_MODE_METADATA since metadata is always counted.
   5115      */
   5116     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
   5117         required = virtual_size;
   5118     }
   5119 
   5120     info = g_new0(BlockMeasureInfo, 1);
   5121     info->fully_allocated = luks_payload_size +
   5122         qcow2_calc_prealloc_size(virtual_size, cluster_size,
   5123                                  ctz32(refcount_bits), extended_l2);
   5124 
   5125     /*
   5126      * Remove data clusters that are not required.  This overestimates the
   5127      * required size because metadata needed for the fully allocated file is
   5128      * still counted.  Show bitmaps only if both source and destination
   5129      * would support them.
   5130      */
   5131     info->required = info->fully_allocated - virtual_size + required;
   5132     info->has_bitmaps = version >= 3 && in_bs &&
   5133         bdrv_supports_persistent_dirty_bitmap(in_bs);
   5134     if (info->has_bitmaps) {
   5135         info->bitmaps = qcow2_get_persistent_dirty_bitmap_size(in_bs,
   5136                                                                cluster_size);
   5137     }
   5138     return info;
   5139 
   5140 err:
   5141     error_propagate(errp, local_err);
   5142     return NULL;
   5143 }
   5144 
   5145 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   5146 {
   5147     BDRVQcow2State *s = bs->opaque;
   5148     bdi->cluster_size = s->cluster_size;
   5149     bdi->vm_state_offset = qcow2_vm_state_offset(s);
   5150     bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
   5151     return 0;
   5152 }
   5153 
   5154 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs,
   5155                                                   Error **errp)
   5156 {
   5157     BDRVQcow2State *s = bs->opaque;
   5158     ImageInfoSpecific *spec_info;
   5159     QCryptoBlockInfo *encrypt_info = NULL;
   5160 
   5161     if (s->crypto != NULL) {
   5162         encrypt_info = qcrypto_block_get_info(s->crypto, errp);
   5163         if (!encrypt_info) {
   5164             return NULL;
   5165         }
   5166     }
   5167 
   5168     spec_info = g_new(ImageInfoSpecific, 1);
   5169     *spec_info = (ImageInfoSpecific){
   5170         .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
   5171         .u.qcow2.data = g_new0(ImageInfoSpecificQCow2, 1),
   5172     };
   5173     if (s->qcow_version == 2) {
   5174         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
   5175             .compat             = g_strdup("0.10"),
   5176             .refcount_bits      = s->refcount_bits,
   5177         };
   5178     } else if (s->qcow_version == 3) {
   5179         Qcow2BitmapInfoList *bitmaps;
   5180         if (!qcow2_get_bitmap_info_list(bs, &bitmaps, errp)) {
   5181             qapi_free_ImageInfoSpecific(spec_info);
   5182             qapi_free_QCryptoBlockInfo(encrypt_info);
   5183             return NULL;
   5184         }
   5185         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
   5186             .compat             = g_strdup("1.1"),
   5187             .lazy_refcounts     = s->compatible_features &
   5188                                   QCOW2_COMPAT_LAZY_REFCOUNTS,
   5189             .has_lazy_refcounts = true,
   5190             .corrupt            = s->incompatible_features &
   5191                                   QCOW2_INCOMPAT_CORRUPT,
   5192             .has_corrupt        = true,
   5193             .has_extended_l2    = true,
   5194             .extended_l2        = has_subclusters(s),
   5195             .refcount_bits      = s->refcount_bits,
   5196             .has_bitmaps        = !!bitmaps,
   5197             .bitmaps            = bitmaps,
   5198             .has_data_file      = !!s->image_data_file,
   5199             .data_file          = g_strdup(s->image_data_file),
   5200             .has_data_file_raw  = has_data_file(bs),
   5201             .data_file_raw      = data_file_is_raw(bs),
   5202             .compression_type   = s->compression_type,
   5203         };
   5204     } else {
   5205         /* if this assertion fails, this probably means a new version was
   5206          * added without having it covered here */
   5207         assert(false);
   5208     }
   5209 
   5210     if (encrypt_info) {
   5211         ImageInfoSpecificQCow2Encryption *qencrypt =
   5212             g_new(ImageInfoSpecificQCow2Encryption, 1);
   5213         switch (encrypt_info->format) {
   5214         case Q_CRYPTO_BLOCK_FORMAT_QCOW:
   5215             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
   5216             break;
   5217         case Q_CRYPTO_BLOCK_FORMAT_LUKS:
   5218             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
   5219             qencrypt->u.luks = encrypt_info->u.luks;
   5220             break;
   5221         default:
   5222             abort();
   5223         }
   5224         /* Since we did shallow copy above, erase any pointers
   5225          * in the original info */
   5226         memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
   5227         qapi_free_QCryptoBlockInfo(encrypt_info);
   5228 
   5229         spec_info->u.qcow2.data->has_encrypt = true;
   5230         spec_info->u.qcow2.data->encrypt = qencrypt;
   5231     }
   5232 
   5233     return spec_info;
   5234 }
   5235 
   5236 static int qcow2_has_zero_init(BlockDriverState *bs)
   5237 {
   5238     BDRVQcow2State *s = bs->opaque;
   5239     bool preallocated;
   5240 
   5241     if (qemu_in_coroutine()) {
   5242         qemu_co_mutex_lock(&s->lock);
   5243     }
   5244     /*
   5245      * Check preallocation status: Preallocated images have all L2
   5246      * tables allocated, nonpreallocated images have none.  It is
   5247      * therefore enough to check the first one.
   5248      */
   5249     preallocated = s->l1_size > 0 && s->l1_table[0] != 0;
   5250     if (qemu_in_coroutine()) {
   5251         qemu_co_mutex_unlock(&s->lock);
   5252     }
   5253 
   5254     if (!preallocated) {
   5255         return 1;
   5256     } else if (bs->encrypted) {
   5257         return 0;
   5258     } else {
   5259         return bdrv_has_zero_init(s->data_file->bs);
   5260     }
   5261 }
   5262 
   5263 /*
   5264  * Check the request to vmstate. On success return
   5265  *      qcow2_vm_state_offset(bs) + @pos
   5266  */
   5267 static int64_t qcow2_check_vmstate_request(BlockDriverState *bs,
   5268                                            QEMUIOVector *qiov, int64_t pos)
   5269 {
   5270     BDRVQcow2State *s = bs->opaque;
   5271     int64_t vmstate_offset = qcow2_vm_state_offset(s);
   5272     int ret;
   5273 
   5274     /* Incoming requests must be OK */
   5275     bdrv_check_qiov_request(pos, qiov->size, qiov, 0, &error_abort);
   5276 
   5277     if (INT64_MAX - pos < vmstate_offset) {
   5278         return -EIO;
   5279     }
   5280 
   5281     pos += vmstate_offset;
   5282     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
   5283     if (ret < 0) {
   5284         return ret;
   5285     }
   5286 
   5287     return pos;
   5288 }
   5289 
   5290 static coroutine_fn int qcow2_save_vmstate(BlockDriverState *bs,
   5291                                            QEMUIOVector *qiov, int64_t pos)
   5292 {
   5293     int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
   5294     if (offset < 0) {
   5295         return offset;
   5296     }
   5297 
   5298     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
   5299     return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
   5300 }
   5301 
   5302 static coroutine_fn int qcow2_load_vmstate(BlockDriverState *bs,
   5303                                            QEMUIOVector *qiov, int64_t pos)
   5304 {
   5305     int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
   5306     if (offset < 0) {
   5307         return offset;
   5308     }
   5309 
   5310     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
   5311     return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0);
   5312 }
   5313 
   5314 static int qcow2_has_compressed_clusters(BlockDriverState *bs)
   5315 {
   5316     int64_t offset = 0;
   5317     int64_t bytes = bdrv_getlength(bs);
   5318 
   5319     if (bytes < 0) {
   5320         return bytes;
   5321     }
   5322 
   5323     while (bytes != 0) {
   5324         int ret;
   5325         QCow2SubclusterType type;
   5326         unsigned int cur_bytes = MIN(INT_MAX, bytes);
   5327         uint64_t host_offset;
   5328 
   5329         ret = qcow2_get_host_offset(bs, offset, &cur_bytes, &host_offset,
   5330                                     &type);
   5331         if (ret < 0) {
   5332             return ret;
   5333         }
   5334 
   5335         if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
   5336             return 1;
   5337         }
   5338 
   5339         offset += cur_bytes;
   5340         bytes -= cur_bytes;
   5341     }
   5342 
   5343     return 0;
   5344 }
   5345 
   5346 /*
   5347  * Downgrades an image's version. To achieve this, any incompatible features
   5348  * have to be removed.
   5349  */
   5350 static int qcow2_downgrade(BlockDriverState *bs, int target_version,
   5351                            BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
   5352                            Error **errp)
   5353 {
   5354     BDRVQcow2State *s = bs->opaque;
   5355     int current_version = s->qcow_version;
   5356     int ret;
   5357     int i;
   5358 
   5359     /* This is qcow2_downgrade(), not qcow2_upgrade() */
   5360     assert(target_version < current_version);
   5361 
   5362     /* There are no other versions (now) that you can downgrade to */
   5363     assert(target_version == 2);
   5364 
   5365     if (s->refcount_order != 4) {
   5366         error_setg(errp, "compat=0.10 requires refcount_bits=16");
   5367         return -ENOTSUP;
   5368     }
   5369 
   5370     if (has_data_file(bs)) {
   5371         error_setg(errp, "Cannot downgrade an image with a data file");
   5372         return -ENOTSUP;
   5373     }
   5374 
   5375     /*
   5376      * If any internal snapshot has a different size than the current
   5377      * image size, or VM state size that exceeds 32 bits, downgrading
   5378      * is unsafe.  Even though we would still use v3-compliant output
   5379      * to preserve that data, other v2 programs might not realize
   5380      * those optional fields are important.
   5381      */
   5382     for (i = 0; i < s->nb_snapshots; i++) {
   5383         if (s->snapshots[i].vm_state_size > UINT32_MAX ||
   5384             s->snapshots[i].disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
   5385             error_setg(errp, "Internal snapshots prevent downgrade of image");
   5386             return -ENOTSUP;
   5387         }
   5388     }
   5389 
   5390     /* clear incompatible features */
   5391     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
   5392         ret = qcow2_mark_clean(bs);
   5393         if (ret < 0) {
   5394             error_setg_errno(errp, -ret, "Failed to make the image clean");
   5395             return ret;
   5396         }
   5397     }
   5398 
   5399     /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
   5400      * the first place; if that happens nonetheless, returning -ENOTSUP is the
   5401      * best thing to do anyway */
   5402 
   5403     if (s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION) {
   5404         error_setg(errp, "Cannot downgrade an image with incompatible features "
   5405                    "0x%" PRIx64 " set",
   5406                    s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION);
   5407         return -ENOTSUP;
   5408     }
   5409 
   5410     /* since we can ignore compatible features, we can set them to 0 as well */
   5411     s->compatible_features = 0;
   5412     /* if lazy refcounts have been used, they have already been fixed through
   5413      * clearing the dirty flag */
   5414 
   5415     /* clearing autoclear features is trivial */
   5416     s->autoclear_features = 0;
   5417 
   5418     ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
   5419     if (ret < 0) {
   5420         error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
   5421         return ret;
   5422     }
   5423 
   5424     if (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) {
   5425         ret = qcow2_has_compressed_clusters(bs);
   5426         if (ret < 0) {
   5427             error_setg(errp, "Failed to check block status");
   5428             return -EINVAL;
   5429         }
   5430         if (ret) {
   5431             error_setg(errp, "Cannot downgrade an image with zstd compression "
   5432                        "type and existing compressed clusters");
   5433             return -ENOTSUP;
   5434         }
   5435         /*
   5436          * No compressed clusters for now, so just chose default zlib
   5437          * compression.
   5438          */
   5439         s->incompatible_features &= ~QCOW2_INCOMPAT_COMPRESSION;
   5440         s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
   5441     }
   5442 
   5443     assert(s->incompatible_features == 0);
   5444 
   5445     s->qcow_version = target_version;
   5446     ret = qcow2_update_header(bs);
   5447     if (ret < 0) {
   5448         s->qcow_version = current_version;
   5449         error_setg_errno(errp, -ret, "Failed to update the image header");
   5450         return ret;
   5451     }
   5452     return 0;
   5453 }
   5454 
   5455 /*
   5456  * Upgrades an image's version.  While newer versions encompass all
   5457  * features of older versions, some things may have to be presented
   5458  * differently.
   5459  */
   5460 static int qcow2_upgrade(BlockDriverState *bs, int target_version,
   5461                          BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
   5462                          Error **errp)
   5463 {
   5464     BDRVQcow2State *s = bs->opaque;
   5465     bool need_snapshot_update;
   5466     int current_version = s->qcow_version;
   5467     int i;
   5468     int ret;
   5469 
   5470     /* This is qcow2_upgrade(), not qcow2_downgrade() */
   5471     assert(target_version > current_version);
   5472 
   5473     /* There are no other versions (yet) that you can upgrade to */
   5474     assert(target_version == 3);
   5475 
   5476     status_cb(bs, 0, 2, cb_opaque);
   5477 
   5478     /*
   5479      * In v2, snapshots do not need to have extra data.  v3 requires
   5480      * the 64-bit VM state size and the virtual disk size to be
   5481      * present.
   5482      * qcow2_write_snapshots() will always write the list in the
   5483      * v3-compliant format.
   5484      */
   5485     need_snapshot_update = false;
   5486     for (i = 0; i < s->nb_snapshots; i++) {
   5487         if (s->snapshots[i].extra_data_size <
   5488             sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
   5489             sizeof_field(QCowSnapshotExtraData, disk_size))
   5490         {
   5491             need_snapshot_update = true;
   5492             break;
   5493         }
   5494     }
   5495     if (need_snapshot_update) {
   5496         ret = qcow2_write_snapshots(bs);
   5497         if (ret < 0) {
   5498             error_setg_errno(errp, -ret, "Failed to update the snapshot table");
   5499             return ret;
   5500         }
   5501     }
   5502     status_cb(bs, 1, 2, cb_opaque);
   5503 
   5504     s->qcow_version = target_version;
   5505     ret = qcow2_update_header(bs);
   5506     if (ret < 0) {
   5507         s->qcow_version = current_version;
   5508         error_setg_errno(errp, -ret, "Failed to update the image header");
   5509         return ret;
   5510     }
   5511     status_cb(bs, 2, 2, cb_opaque);
   5512 
   5513     return 0;
   5514 }
   5515 
/*
 * Identifies which amend step is currently reporting progress through
 * qcow2_amend_helper_cb().
 */
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    /* Set by qcow2_amend_options() before the respective step is started */
    QCOW2_UPGRADING,
    QCOW2_UPDATING_ENCRYPTION,
    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;
   5527 
/*
 * State shared between qcow2_amend_options() and qcow2_amend_helper_cb(),
 * used to merge the progress of several amend steps into a single
 * offset/total pair for the caller's status callback.
 */
typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */

    /* Caller-supplied status callback and its opaque argument */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    /* Step that is about to run; compared against last_operation by the CB */
    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    /* Operation seen by the previous CB invocation, and the work size it
     * reported (added to offset_completed once the operation changes) */
    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;
   5550 
   5551 static void qcow2_amend_helper_cb(BlockDriverState *bs,
   5552                                   int64_t operation_offset,
   5553                                   int64_t operation_work_size, void *opaque)
   5554 {
   5555     Qcow2AmendHelperCBInfo *info = opaque;
   5556     int64_t current_work_size;
   5557     int64_t projected_work_size;
   5558 
   5559     if (info->current_operation != info->last_operation) {
   5560         if (info->last_operation != QCOW2_NO_OPERATION) {
   5561             info->offset_completed += info->last_work_size;
   5562             info->operations_completed++;
   5563         }
   5564 
   5565         info->last_operation = info->current_operation;
   5566     }
   5567 
   5568     assert(info->total_operations > 0);
   5569     assert(info->operations_completed < info->total_operations);
   5570 
   5571     info->last_work_size = operation_work_size;
   5572 
   5573     current_work_size = info->offset_completed + operation_work_size;
   5574 
   5575     /* current_work_size is the total work size for (operations_completed + 1)
   5576      * operations (which includes this one), so multiply it by the number of
   5577      * operations not covered and divide it by the number of operations
   5578      * covered to get a projection for the operations not covered */
   5579     projected_work_size = current_work_size * (info->total_operations -
   5580                                                info->operations_completed - 1)
   5581                                             / (info->operations_completed + 1);
   5582 
   5583     info->original_status_cb(bs, info->offset_completed + operation_offset,
   5584                              current_work_size + projected_work_size,
   5585                              info->original_cb_opaque);
   5586 }
   5587 
/*
 * Applies the options explicitly set in @opts to an existing image.
 *
 * The option descriptors are walked first to collect and validate the
 * requested values; the changes are then applied in a fixed order:
 * version upgrade, LUKS encryption update, refcount width change,
 * data-file settings (followed by a header update), backing file check,
 * lazy refcounts, resize and finally version downgrade.
 *
 * @status_cb/@cb_opaque receive combined progress through
 * qcow2_amend_helper_cb().  @force is only forwarded to
 * qcrypto_block_amend_options().
 *
 * Returns 0 on success; on failure a negative errno is returned and @errp
 * is set.  A failure returns immediately, without reverting steps that
 * have already been applied.
 */
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque,
                               bool force,
                               Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
    const char *backing_file = NULL, *backing_format = NULL, *data_file = NULL;
    bool lazy_refcounts = s->use_lazy_refcounts;
    bool data_file_raw = data_file_is_raw(bs);
    const char *compat = NULL;
    int refcount_bits = s->refcount_bits;
    int ret;
    QemuOptDesc *desc = opts->list->desc;
    Qcow2AmendHelperCBInfo helper_cb_info;
    bool encryption_update = false;

    /* Pass 1: collect and validate the requested values; nothing is
     * written to the image in this loop */
    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
            /* only change explicitly defined options */
            desc++;
            continue;
        }

        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
            if (!compat) {
                /* preserve default */
            } else if (!strcmp(compat, "0.10") || !strcmp(compat, "v2")) {
                new_version = 2;
            } else if (!strcmp(compat, "1.1") || !strcmp(compat, "v3")) {
                new_version = 3;
            } else {
                error_setg(errp, "Unknown compatibility level %s", compat);
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
            /* Any "encrypt."-prefixed option requests an encryption update;
             * the values themselves are extracted further down */
            if (!s->crypto) {
                error_setg(errp,
                           "Can't amend encryption options - encryption not present");
                return -EINVAL;
            }
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp,
                           "Only LUKS encryption options can be amended");
                return -ENOTSUP;
            }
            encryption_update = true;
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                               lazy_refcounts);
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
                error_setg(errp, "Refcount width must be a power of two and "
                           "may not exceed 64 bits");
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE)) {
            data_file = qemu_opt_get(opts, BLOCK_OPT_DATA_FILE);
            if (data_file && !has_data_file(bs)) {
                error_setg(errp, "data-file can only be set for images that "
                                 "use an external data file");
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE_RAW)) {
            data_file_raw = qemu_opt_get_bool(opts, BLOCK_OPT_DATA_FILE_RAW,
                                              data_file_raw);
            /* The flag may be cleared, but not newly set */
            if (data_file_raw && !data_file_is_raw(bs)) {
                error_setg(errp, "data-file-raw cannot be set on existing "
                                 "images");
                return -EINVAL;
            }
        } else {
            /* if this point is reached, this probably means a new option was
             * added without having it covered here */
            abort();
        }

        desc++;
    }

    /* All fields not named here (the CB-managed ones) start out as zero.
     * Each condition below contributes 1 to total_operations when true. */
    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
        .total_operations = (new_version != old_version)
                          + (s->refcount_bits != refcount_bits) +
                            (encryption_update == true)
    };

    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
        helper_cb_info.current_operation = QCOW2_UPGRADING;
        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
                            &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    if (encryption_update) {
        QDict *amend_opts_dict;
        QCryptoBlockAmendOptions *amend_opts;

        helper_cb_info.current_operation = QCOW2_UPDATING_ENCRYPTION;
        amend_opts_dict = qcow2_extract_crypto_opts(opts, "luks", errp);
        if (!amend_opts_dict) {
            return -EINVAL;
        }
        amend_opts = block_crypto_amend_opts_init(amend_opts_dict, errp);
        /* The dict is only needed to build amend_opts */
        qobject_unref(amend_opts_dict);
        if (!amend_opts) {
            return -EINVAL;
        }
        /* No status callback is passed along for this step */
        ret = qcrypto_block_amend_options(s->crypto,
                                          qcow2_crypto_hdr_read_func,
                                          qcow2_crypto_hdr_write_func,
                                          bs,
                                          amend_opts,
                                          force,
                                          errp);
        qapi_free_QCryptoBlockAmendOptions(amend_opts);
        if (ret < 0) {
            return ret;
        }
    }

    if (s->refcount_bits != refcount_bits) {
        /* refcount_bits was validated as a power of two above, so its
         * number of trailing zero bits is the refcount order */
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
            error_setg(errp, "Refcount widths other than 16 bits require "
                       "compatibility level 1.1 or above (use compat=1.1 or "
                       "greater)");
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
                                          &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    /* data-file-raw blocks backing files, so clear it first if requested */
    if (data_file_raw) {
        s->autoclear_features |= QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    } else {
        s->autoclear_features &= ~QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    }

    if (data_file) {
        /* An empty string removes the stored data file name */
        g_free(s->image_data_file);
        s->image_data_file = *data_file ? g_strdup(data_file) : NULL;
    }

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image header");
        return ret;
    }

    /* The backing file options are accepted only if they match what the
     * image already records; they cannot be changed here */
    if (backing_file || backing_format) {
        if (g_strcmp0(backing_file, s->image_backing_file) ||
            g_strcmp0(backing_format, s->image_backing_format)) {
            error_setg(errp, "Cannot amend the backing file");
            error_append_hint(errp,
                              "You can use 'qemu-img rebase' instead.\n");
            return -EINVAL;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
            if (new_version < 3) {
                error_setg(errp, "Lazy refcounts only supported with "
                           "compatibility level 1.1 and above (use compat=1.1 "
                           "or greater)");
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                /* undo the in-memory change, the header was not updated */
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
                error_setg_errno(errp, -ret, "Failed to update the image header");
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to make the image clean");
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                /* undo the in-memory change, the header was not updated */
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
                error_setg_errno(errp, -ret, "Failed to update the image header");
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    /* new_size stays 0 unless a size was given, so a size of 0 is treated
     * the same as no size option at all */
    if (new_size) {
        BlockBackend *blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL,
                                            errp);
        if (!blk) {
            return -EPERM;
        }

        /*
         * Amending image options should ensure that the image has
         * exactly the given new values, so pass exact=true here.
         */
        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp);
        blk_unref(blk);
        if (ret < 0) {
            return ret;
        }
    }

    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
                              &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
   5839 
   5840 static int coroutine_fn qcow2_co_amend(BlockDriverState *bs,
   5841                                        BlockdevAmendOptions *opts,
   5842                                        bool force,
   5843                                        Error **errp)
   5844 {
   5845     BlockdevAmendOptionsQcow2 *qopts = &opts->u.qcow2;
   5846     BDRVQcow2State *s = bs->opaque;
   5847     int ret = 0;
   5848 
   5849     if (qopts->has_encrypt) {
   5850         if (!s->crypto) {
   5851             error_setg(errp, "image is not encrypted, can't amend");
   5852             return -EOPNOTSUPP;
   5853         }
   5854 
   5855         if (qopts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) {
   5856             error_setg(errp,
   5857                        "Amend can't be used to change the qcow2 encryption format");
   5858             return -EOPNOTSUPP;
   5859         }
   5860 
   5861         if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
   5862             error_setg(errp,
   5863                        "Only LUKS encryption options can be amended for qcow2 with blockdev-amend");
   5864             return -EOPNOTSUPP;
   5865         }
   5866 
   5867         ret = qcrypto_block_amend_options(s->crypto,
   5868                                           qcow2_crypto_hdr_read_func,
   5869                                           qcow2_crypto_hdr_write_func,
   5870                                           bs,
   5871                                           qopts->encrypt,
   5872                                           force,
   5873                                           errp);
   5874     }
   5875     return ret;
   5876 }
   5877 
   5878 /*
   5879  * If offset or size are negative, respectively, they will not be included in
   5880  * the BLOCK_IMAGE_CORRUPTED event emitted.
   5881  * fatal will be ignored for read-only BDS; corruptions found there will always
   5882  * be considered non-fatal.
   5883  */
   5884 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
   5885                              int64_t size, const char *message_format, ...)
   5886 {
   5887     BDRVQcow2State *s = bs->opaque;
   5888     const char *node_name;
   5889     char *message;
   5890     va_list ap;
   5891 
   5892     fatal = fatal && bdrv_is_writable(bs);
   5893 
   5894     if (s->signaled_corruption &&
   5895         (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
   5896     {
   5897         return;
   5898     }
   5899 
   5900     va_start(ap, message_format);
   5901     message = g_strdup_vprintf(message_format, ap);
   5902     va_end(ap);
   5903 
   5904     if (fatal) {
   5905         fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
   5906                 "corruption events will be suppressed\n", message);
   5907     } else {
   5908         fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
   5909                 "corruption events will be suppressed\n", message);
   5910     }
   5911 
   5912     node_name = bdrv_get_node_name(bs);
   5913     qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
   5914                                           *node_name != '\0', node_name,
   5915                                           message, offset >= 0, offset,
   5916                                           size >= 0, size,
   5917                                           fatal);
   5918     g_free(message);
   5919 
   5920     if (fatal) {
   5921         qcow2_mark_corrupt(bs);
   5922         bs->drv = NULL; /* make BDS unusable */
   5923     }
   5924 
   5925     s->signaled_corruption = true;
   5926 }
   5927 
/*
 * Option descriptors shared by qcow2_create_opts and qcow2_amend_opts.
 * The macro expands to a comma-separated list of initializers without a
 * trailing comma, so the user has to add one before any further entry.
 */
#define QCOW_COMMON_OPTIONS                                         \
    {                                                               \
        .name = BLOCK_OPT_SIZE,                                     \
        .type = QEMU_OPT_SIZE,                                      \
        .help = "Virtual disk size"                                 \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_COMPAT_LEVEL,                             \
        .type = QEMU_OPT_STRING,                                    \
        .help = "Compatibility level (v2 [0.10] or v3 [1.1])"       \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_BACKING_FILE,                             \
        .type = QEMU_OPT_STRING,                                    \
        .help = "File name of a base image"                         \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_BACKING_FMT,                              \
        .type = QEMU_OPT_STRING,                                    \
        .help = "Image format of the base image"                    \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_DATA_FILE,                                \
        .type = QEMU_OPT_STRING,                                    \
        .help = "File name of an external data file"                \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_DATA_FILE_RAW,                            \
        .type = QEMU_OPT_BOOL,                                      \
        .help = "The external data file must stay valid "           \
                "as a raw image"                                    \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_LAZY_REFCOUNTS,                           \
        .type = QEMU_OPT_BOOL,                                      \
        .help = "Postpone refcount updates",                        \
        .def_value_str = "off"                                      \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_REFCOUNT_BITS,                            \
        .type = QEMU_OPT_NUMBER,                                    \
        .help = "Width of a reference count entry in bits",         \
        .def_value_str = "16"                                       \
    }
   5972 
   5973 static QemuOptsList qcow2_create_opts = {
   5974     .name = "qcow2-create-opts",
   5975     .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
   5976     .desc = {
   5977         {                                                               \
   5978             .name = BLOCK_OPT_ENCRYPT,                                  \
   5979             .type = QEMU_OPT_BOOL,                                      \
   5980             .help = "Encrypt the image with format 'aes'. (Deprecated " \
   5981                     "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",    \
   5982         },                                                              \
   5983         {                                                               \
   5984             .name = BLOCK_OPT_ENCRYPT_FORMAT,                           \
   5985             .type = QEMU_OPT_STRING,                                    \
   5986             .help = "Encrypt the image, format choices: 'aes', 'luks'", \
   5987         },                                                              \
   5988         BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",                     \
   5989             "ID of secret providing qcow AES key or LUKS passphrase"),  \
   5990         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),               \
   5991         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),              \
   5992         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),                \
   5993         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),           \
   5994         BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),                 \
   5995         BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),                \
   5996         {                                                               \
   5997             .name = BLOCK_OPT_CLUSTER_SIZE,                             \
   5998             .type = QEMU_OPT_SIZE,                                      \
   5999             .help = "qcow2 cluster size",                               \
   6000             .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)            \
   6001         },                                                              \
   6002         {                                                               \
   6003             .name = BLOCK_OPT_EXTL2,                                    \
   6004             .type = QEMU_OPT_BOOL,                                      \
   6005             .help = "Extended L2 tables",                               \
   6006             .def_value_str = "off"                                      \
   6007         },                                                              \
   6008         {                                                               \
   6009             .name = BLOCK_OPT_PREALLOC,                                 \
   6010             .type = QEMU_OPT_STRING,                                    \
   6011             .help = "Preallocation mode (allowed values: off, "         \
   6012                     "metadata, falloc, full)"                           \
   6013         },                                                              \
   6014         {                                                               \
   6015             .name = BLOCK_OPT_COMPRESSION_TYPE,                         \
   6016             .type = QEMU_OPT_STRING,                                    \
   6017             .help = "Compression method used for image cluster "        \
   6018                     "compression",                                      \
   6019             .def_value_str = "zlib"                                     \
   6020         },
   6021         QCOW_COMMON_OPTIONS,
   6022         { /* end of list */ }
   6023     }
   6024 };
   6025 
/*
 * Options accepted when amending an image: the LUKS options under the
 * "encrypt." prefix, followed by the ones shared with creation
 * (QCOW_COMMON_OPTIONS).
 */
static QemuOptsList qcow2_amend_opts = {
    .name = "qcow2-amend-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_amend_opts.head),
    .desc = {
        BLOCK_CRYPTO_OPT_DEF_LUKS_STATE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_KEYSLOT("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_OLD_SECRET("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_NEW_SECRET("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
        QCOW_COMMON_OPTIONS,
        { /* end of list */ }
    }
};
   6039 
/* NULL-terminated list of the driver's strong runtime option names */
static const char *const qcow2_strong_runtime_opts[] = {
    "encrypt." BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,

    NULL
};
   6045 
   6046 BlockDriver bdrv_qcow2 = {
            /*
             * Driver table for the "qcow2" format.  It wires the qcow2_*
             * callbacks (plus the generic bdrv_default_perms) to their
             * BlockDriver slots and is handed to bdrv_register() by
             * bdrv_qcow2_init() at the end of this file.
             */

            /* Format name and size of the per-instance state structure. */
   6047     .format_name        = "qcow2",
   6048     .instance_size      = sizeof(BDRVQcow2State),
            /* Probe, open/close, reopen, permissions, creation, block status. */
   6049     .bdrv_probe         = qcow2_probe,
   6050     .bdrv_open          = qcow2_open,
   6051     .bdrv_close         = qcow2_close,
   6052     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
   6053     .bdrv_reopen_commit   = qcow2_reopen_commit,
   6054     .bdrv_reopen_commit_post = qcow2_reopen_commit_post,
   6055     .bdrv_reopen_abort    = qcow2_reopen_abort,
   6056     .bdrv_join_options    = qcow2_join_options,
   6057     .bdrv_child_perm      = bdrv_default_perms,
   6058     .bdrv_co_create_opts  = qcow2_co_create_opts,
   6059     .bdrv_co_create       = qcow2_co_create,
   6060     .bdrv_has_zero_init   = qcow2_has_zero_init,
   6061     .bdrv_co_block_status = qcow2_co_block_status,
   6062 
            /* Read, write and flush-to-OS coroutine callbacks. */
   6063     .bdrv_co_preadv_part    = qcow2_co_preadv_part,
   6064     .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
   6065     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
   6066 
            /*
             * Zero-write, discard, copy-range, truncate, compressed-write and
             * make-empty callbacks.
             */
   6067     .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
   6068     .bdrv_co_pdiscard       = qcow2_co_pdiscard,
   6069     .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
   6070     .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
   6071     .bdrv_co_truncate       = qcow2_co_truncate,
   6072     .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
   6073     .bdrv_make_empty        = qcow2_make_empty,
   6074 
            /* Snapshot operations, followed by the measure and info queries. */
   6075     .bdrv_snapshot_create   = qcow2_snapshot_create,
   6076     .bdrv_snapshot_goto     = qcow2_snapshot_goto,
   6077     .bdrv_snapshot_delete   = qcow2_snapshot_delete,
   6078     .bdrv_snapshot_list     = qcow2_snapshot_list,
   6079     .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
   6080     .bdrv_measure           = qcow2_measure,
   6081     .bdrv_get_info          = qcow2_get_info,
   6082     .bdrv_get_specific_info = qcow2_get_specific_info,
   6083 
            /* VM state save and load. */
   6084     .bdrv_save_vmstate    = qcow2_save_vmstate,
   6085     .bdrv_load_vmstate    = qcow2_load_vmstate,
   6086 
            /*
             * The driver is flagged as a format driver that supports backing
             * files, and supplies the callback for changing the backing file.
             */
   6087     .is_format                  = true,
   6088     .supports_backing           = true,
   6089     .bdrv_change_backing_file   = qcow2_change_backing_file,
   6090 
            /* Limits refresh, cache invalidation and inactivation. */
   6091     .bdrv_refresh_limits        = qcow2_refresh_limits,
   6092     .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
   6093     .bdrv_inactivate            = qcow2_inactivate,
   6094 
            /*
             * Option tables, plus the check and amend callbacks.
             * qcow2_amend_opts and qcow2_strong_runtime_opts are defined just
             * above this table; mutable_opts is declared elsewhere in the file.
             */
   6095     .create_opts         = &qcow2_create_opts,
   6096     .amend_opts          = &qcow2_amend_opts,
   6097     .strong_runtime_opts = qcow2_strong_runtime_opts,
   6098     .mutable_opts        = mutable_opts,
   6099     .bdrv_co_check       = qcow2_co_check,
   6100     .bdrv_amend_options  = qcow2_amend_options,
   6101     .bdrv_co_amend       = qcow2_co_amend,
   6102 
            /* AioContext detach/attach hooks. */
   6103     .bdrv_detach_aio_context  = qcow2_detach_aio_context,
   6104     .bdrv_attach_aio_context  = qcow2_attach_aio_context,
   6105 
            /* Persistent dirty bitmap callbacks. */
   6106     .bdrv_supports_persistent_dirty_bitmap =
   6107             qcow2_supports_persistent_dirty_bitmap,
   6108     .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
   6109     .bdrv_co_remove_persistent_dirty_bitmap =
   6110             qcow2_co_remove_persistent_dirty_bitmap,
   6111 };
   6112 
   6113 static void bdrv_qcow2_init(void)
   6114 {
            /* Pass the qcow2 driver table defined above to bdrv_register(). */
   6115     bdrv_register(&bdrv_qcow2);
   6116 }
   6117 
        /*
         * NOTE(review): block_init() is defined outside this file; it presumably
         * schedules bdrv_qcow2_init() to run during module initialisation
         * ("qemu/module.h" is included at the top of the file) -- confirm.
         */
   6118 block_init(bdrv_qcow2_init);